avx512.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /* -*- linux-c -*- --------------------------------------------------------
  3. *
  4. * Copyright (C) 2016 Intel Corporation
  5. *
  6. * Author: Gayatri Kammela <gayatri.kammela@intel.com>
  7. * Author: Megha Dey <megha.dey@linux.intel.com>
  8. *
  9. * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
  10. * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
  11. *
  12. * -----------------------------------------------------------------------
  13. */
  14. /*
  15. * AVX512 implementation of RAID-6 syndrome functions
  16. *
  17. */
  18. #include <linux/raid/pq.h>
  19. #include "x86.h"
  20. static const struct raid6_avx512_constants {
  21. u64 x1d[8];
  22. } raid6_avx512_constants __aligned(512/8) = {
  23. { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  24. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  25. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  26. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
  27. };
  28. static int raid6_have_avx512(void)
  29. {
  30. return boot_cpu_has(X86_FEATURE_AVX2) &&
  31. boot_cpu_has(X86_FEATURE_AVX) &&
  32. boot_cpu_has(X86_FEATURE_AVX512F) &&
  33. boot_cpu_has(X86_FEATURE_AVX512BW) &&
  34. boot_cpu_has(X86_FEATURE_AVX512VL) &&
  35. boot_cpu_has(X86_FEATURE_AVX512DQ);
  36. }
  37. static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
  38. {
  39. u8 **dptr = (u8 **)ptrs;
  40. u8 *p, *q;
  41. int d, z, z0;
  42. z0 = disks - 3; /* Highest data disk */
  43. p = dptr[z0+1]; /* XOR parity */
  44. q = dptr[z0+2]; /* RS syndrome */
  45. kernel_fpu_begin();
  46. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  47. "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
  48. :
  49. : "m" (raid6_avx512_constants.x1d[0]));
  50. for (d = 0; d < bytes; d += 64) {
  51. asm volatile("prefetchnta %0\n\t"
  52. "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
  53. "prefetchnta %1\n\t"
  54. "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
  55. "vmovdqa64 %1,%%zmm6"
  56. :
  57. : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
  58. for (z = z0-2; z >= 0; z--) {
  59. asm volatile("prefetchnta %0\n\t"
  60. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  61. "vpmovm2b %%k1,%%zmm5\n\t"
  62. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  63. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  64. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  65. "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
  66. "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
  67. "vmovdqa64 %0,%%zmm6"
  68. :
  69. : "m" (dptr[z][d]));
  70. }
  71. asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  72. "vpmovm2b %%k1,%%zmm5\n\t"
  73. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  74. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  75. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  76. "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
  77. "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
  78. "vmovntdq %%zmm2,%0\n\t"
  79. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
  80. "vmovntdq %%zmm4,%1\n\t"
  81. "vpxorq %%zmm4,%%zmm4,%%zmm4"
  82. :
  83. : "m" (p[d]), "m" (q[d]));
  84. }
  85. asm volatile("sfence" : : : "memory");
  86. kernel_fpu_end();
  87. }
  88. static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
  89. size_t bytes, void **ptrs)
  90. {
  91. u8 **dptr = (u8 **)ptrs;
  92. u8 *p, *q;
  93. int d, z, z0;
  94. z0 = stop; /* P/Q right side optimization */
  95. p = dptr[disks-2]; /* XOR parity */
  96. q = dptr[disks-1]; /* RS syndrome */
  97. kernel_fpu_begin();
  98. asm volatile("vmovdqa64 %0,%%zmm0"
  99. : : "m" (raid6_avx512_constants.x1d[0]));
  100. for (d = 0 ; d < bytes ; d += 64) {
  101. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  102. "vmovdqa64 %1,%%zmm2\n\t"
  103. "vpxorq %%zmm4,%%zmm2,%%zmm2"
  104. :
  105. : "m" (dptr[z0][d]), "m" (p[d]));
  106. /* P/Q data pages */
  107. for (z = z0-1 ; z >= start ; z--) {
  108. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  109. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  110. "vpmovm2b %%k1,%%zmm5\n\t"
  111. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  112. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  113. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  114. "vmovdqa64 %0,%%zmm5\n\t"
  115. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  116. "vpxorq %%zmm5,%%zmm4,%%zmm4"
  117. :
  118. : "m" (dptr[z][d]));
  119. }
  120. /* P/Q left side optimization */
  121. for (z = start-1 ; z >= 0 ; z--) {
  122. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  123. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  124. "vpmovm2b %%k1,%%zmm5\n\t"
  125. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  126. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  127. "vpxorq %%zmm5,%%zmm4,%%zmm4"
  128. :
  129. : );
  130. }
  131. asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
  132. /* Don't use movntdq for r/w memory area < cache line */
  133. "vmovdqa64 %%zmm4,%0\n\t"
  134. "vmovdqa64 %%zmm2,%1"
  135. :
  136. : "m" (q[d]), "m" (p[d]));
  137. }
  138. asm volatile("sfence" : : : "memory");
  139. kernel_fpu_end();
  140. }
  141. const struct raid6_calls raid6_avx512x1 = {
  142. raid6_avx5121_gen_syndrome,
  143. raid6_avx5121_xor_syndrome,
  144. raid6_have_avx512,
  145. "avx512x1",
  146. .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
  147. };
  148. /*
  149. * Unrolled-by-2 AVX512 implementation
  150. */
  151. static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
  152. {
  153. u8 **dptr = (u8 **)ptrs;
  154. u8 *p, *q;
  155. int d, z, z0;
  156. z0 = disks - 3; /* Highest data disk */
  157. p = dptr[z0+1]; /* XOR parity */
  158. q = dptr[z0+2]; /* RS syndrome */
  159. kernel_fpu_begin();
  160. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  161. "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
  162. :
  163. : "m" (raid6_avx512_constants.x1d[0]));
  164. /* We uniformly assume a single prefetch covers at least 64 bytes */
  165. for (d = 0; d < bytes; d += 128) {
  166. asm volatile("prefetchnta %0\n\t"
  167. "prefetchnta %1\n\t"
  168. "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
  169. "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */
  170. "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
  171. "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */
  172. :
  173. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
  174. for (z = z0-1; z >= 0; z--) {
  175. asm volatile("prefetchnta %0\n\t"
  176. "prefetchnta %1\n\t"
  177. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  178. "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
  179. "vpmovm2b %%k1,%%zmm5\n\t"
  180. "vpmovm2b %%k2,%%zmm7\n\t"
  181. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  182. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  183. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  184. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  185. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  186. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  187. "vmovdqa64 %0,%%zmm5\n\t"
  188. "vmovdqa64 %1,%%zmm7\n\t"
  189. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  190. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  191. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  192. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  193. :
  194. : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
  195. }
  196. asm volatile("vmovntdq %%zmm2,%0\n\t"
  197. "vmovntdq %%zmm3,%1\n\t"
  198. "vmovntdq %%zmm4,%2\n\t"
  199. "vmovntdq %%zmm6,%3"
  200. :
  201. : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
  202. "m" (q[d+64]));
  203. }
  204. asm volatile("sfence" : : : "memory");
  205. kernel_fpu_end();
  206. }
  207. static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
  208. size_t bytes, void **ptrs)
  209. {
  210. u8 **dptr = (u8 **)ptrs;
  211. u8 *p, *q;
  212. int d, z, z0;
  213. z0 = stop; /* P/Q right side optimization */
  214. p = dptr[disks-2]; /* XOR parity */
  215. q = dptr[disks-1]; /* RS syndrome */
  216. kernel_fpu_begin();
  217. asm volatile("vmovdqa64 %0,%%zmm0"
  218. : : "m" (raid6_avx512_constants.x1d[0]));
  219. for (d = 0 ; d < bytes ; d += 128) {
  220. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  221. "vmovdqa64 %1,%%zmm6\n\t"
  222. "vmovdqa64 %2,%%zmm2\n\t"
  223. "vmovdqa64 %3,%%zmm3\n\t"
  224. "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
  225. "vpxorq %%zmm6,%%zmm3,%%zmm3"
  226. :
  227. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
  228. "m" (p[d]), "m" (p[d+64]));
  229. /* P/Q data pages */
  230. for (z = z0-1 ; z >= start ; z--) {
  231. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  232. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  233. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  234. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  235. "vpmovm2b %%k1,%%zmm5\n\t"
  236. "vpmovm2b %%k2,%%zmm7\n\t"
  237. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  238. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  239. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  240. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  241. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  242. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  243. "vmovdqa64 %0,%%zmm5\n\t"
  244. "vmovdqa64 %1,%%zmm7\n\t"
  245. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  246. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  247. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  248. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  249. :
  250. : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
  251. }
  252. /* P/Q left side optimization */
  253. for (z = start-1 ; z >= 0 ; z--) {
  254. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  255. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  256. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  257. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  258. "vpmovm2b %%k1,%%zmm5\n\t"
  259. "vpmovm2b %%k2,%%zmm7\n\t"
  260. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  261. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  262. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  263. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  264. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  265. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  266. :
  267. : );
  268. }
  269. asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
  270. "vpxorq %1,%%zmm6,%%zmm6\n\t"
  271. /* Don't use movntdq for r/w
  272. * memory area < cache line
  273. */
  274. "vmovdqa64 %%zmm4,%0\n\t"
  275. "vmovdqa64 %%zmm6,%1\n\t"
  276. "vmovdqa64 %%zmm2,%2\n\t"
  277. "vmovdqa64 %%zmm3,%3"
  278. :
  279. : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
  280. "m" (p[d+64]));
  281. }
  282. asm volatile("sfence" : : : "memory");
  283. kernel_fpu_end();
  284. }
  285. const struct raid6_calls raid6_avx512x2 = {
  286. raid6_avx5122_gen_syndrome,
  287. raid6_avx5122_xor_syndrome,
  288. raid6_have_avx512,
  289. "avx512x2",
  290. .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
  291. };
  292. #ifdef CONFIG_X86_64
  293. /*
  294. * Unrolled-by-4 AVX512 implementation
  295. */
  296. static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
  297. {
  298. u8 **dptr = (u8 **)ptrs;
  299. u8 *p, *q;
  300. int d, z, z0;
  301. z0 = disks - 3; /* Highest data disk */
  302. p = dptr[z0+1]; /* XOR parity */
  303. q = dptr[z0+2]; /* RS syndrome */
  304. kernel_fpu_begin();
  305. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  306. "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */
  307. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */
  308. "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */
  309. "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */
  310. "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */
  311. "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
  312. "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
  313. "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
  314. "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */
  315. :
  316. : "m" (raid6_avx512_constants.x1d[0]));
  317. for (d = 0; d < bytes; d += 256) {
  318. for (z = z0; z >= 0; z--) {
  319. asm volatile("prefetchnta %0\n\t"
  320. "prefetchnta %1\n\t"
  321. "prefetchnta %2\n\t"
  322. "prefetchnta %3\n\t"
  323. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  324. "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
  325. "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
  326. "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
  327. "vpmovm2b %%k1,%%zmm5\n\t"
  328. "vpmovm2b %%k2,%%zmm7\n\t"
  329. "vpmovm2b %%k3,%%zmm13\n\t"
  330. "vpmovm2b %%k4,%%zmm15\n\t"
  331. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  332. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  333. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  334. "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
  335. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  336. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  337. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  338. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  339. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  340. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  341. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  342. "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
  343. "vmovdqa64 %0,%%zmm5\n\t"
  344. "vmovdqa64 %1,%%zmm7\n\t"
  345. "vmovdqa64 %2,%%zmm13\n\t"
  346. "vmovdqa64 %3,%%zmm15\n\t"
  347. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  348. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  349. "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
  350. "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
  351. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  352. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  353. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  354. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  355. :
  356. : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
  357. "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
  358. }
  359. asm volatile("vmovntdq %%zmm2,%0\n\t"
  360. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
  361. "vmovntdq %%zmm3,%1\n\t"
  362. "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
  363. "vmovntdq %%zmm10,%2\n\t"
  364. "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
  365. "vmovntdq %%zmm11,%3\n\t"
  366. "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
  367. "vmovntdq %%zmm4,%4\n\t"
  368. "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
  369. "vmovntdq %%zmm6,%5\n\t"
  370. "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
  371. "vmovntdq %%zmm12,%6\n\t"
  372. "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
  373. "vmovntdq %%zmm14,%7\n\t"
  374. "vpxorq %%zmm14,%%zmm14,%%zmm14"
  375. :
  376. : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  377. "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
  378. "m" (q[d+128]), "m" (q[d+192]));
  379. }
  380. asm volatile("sfence" : : : "memory");
  381. kernel_fpu_end();
  382. }
  383. static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
  384. size_t bytes, void **ptrs)
  385. {
  386. u8 **dptr = (u8 **)ptrs;
  387. u8 *p, *q;
  388. int d, z, z0;
  389. z0 = stop; /* P/Q right side optimization */
  390. p = dptr[disks-2]; /* XOR parity */
  391. q = dptr[disks-1]; /* RS syndrome */
  392. kernel_fpu_begin();
  393. asm volatile("vmovdqa64 %0,%%zmm0"
  394. :: "m" (raid6_avx512_constants.x1d[0]));
  395. for (d = 0 ; d < bytes ; d += 256) {
  396. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  397. "vmovdqa64 %1,%%zmm6\n\t"
  398. "vmovdqa64 %2,%%zmm12\n\t"
  399. "vmovdqa64 %3,%%zmm14\n\t"
  400. "vmovdqa64 %4,%%zmm2\n\t"
  401. "vmovdqa64 %5,%%zmm3\n\t"
  402. "vmovdqa64 %6,%%zmm10\n\t"
  403. "vmovdqa64 %7,%%zmm11\n\t"
  404. "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
  405. "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
  406. "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
  407. "vpxorq %%zmm14,%%zmm11,%%zmm11"
  408. :
  409. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
  410. "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
  411. "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  412. "m" (p[d+192]));
  413. /* P/Q data pages */
  414. for (z = z0-1 ; z >= start ; z--) {
  415. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  416. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  417. "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
  418. "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
  419. "prefetchnta %0\n\t"
  420. "prefetchnta %2\n\t"
  421. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  422. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  423. "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
  424. "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
  425. "vpmovm2b %%k1,%%zmm5\n\t"
  426. "vpmovm2b %%k2,%%zmm7\n\t"
  427. "vpmovm2b %%k3,%%zmm13\n\t"
  428. "vpmovm2b %%k4,%%zmm15\n\t"
  429. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  430. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  431. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  432. "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
  433. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  434. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  435. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  436. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  437. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  438. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  439. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  440. "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
  441. "vmovdqa64 %0,%%zmm5\n\t"
  442. "vmovdqa64 %1,%%zmm7\n\t"
  443. "vmovdqa64 %2,%%zmm13\n\t"
  444. "vmovdqa64 %3,%%zmm15\n\t"
  445. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  446. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  447. "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
  448. "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
  449. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  450. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  451. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  452. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  453. :
  454. : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
  455. "m" (dptr[z][d+128]),
  456. "m" (dptr[z][d+192]));
  457. }
  458. asm volatile("prefetchnta %0\n\t"
  459. "prefetchnta %1\n\t"
  460. :
  461. : "m" (q[d]), "m" (q[d+128]));
  462. /* P/Q left side optimization */
  463. for (z = start-1 ; z >= 0 ; z--) {
  464. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  465. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  466. "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
  467. "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
  468. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  469. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  470. "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
  471. "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
  472. "vpmovm2b %%k1,%%zmm5\n\t"
  473. "vpmovm2b %%k2,%%zmm7\n\t"
  474. "vpmovm2b %%k3,%%zmm13\n\t"
  475. "vpmovm2b %%k4,%%zmm15\n\t"
  476. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  477. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  478. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  479. "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
  480. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  481. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  482. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  483. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  484. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  485. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  486. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  487. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  488. :
  489. : );
  490. }
  491. asm volatile("vmovntdq %%zmm2,%0\n\t"
  492. "vmovntdq %%zmm3,%1\n\t"
  493. "vmovntdq %%zmm10,%2\n\t"
  494. "vmovntdq %%zmm11,%3\n\t"
  495. "vpxorq %4,%%zmm4,%%zmm4\n\t"
  496. "vpxorq %5,%%zmm6,%%zmm6\n\t"
  497. "vpxorq %6,%%zmm12,%%zmm12\n\t"
  498. "vpxorq %7,%%zmm14,%%zmm14\n\t"
  499. "vmovntdq %%zmm4,%4\n\t"
  500. "vmovntdq %%zmm6,%5\n\t"
  501. "vmovntdq %%zmm12,%6\n\t"
  502. "vmovntdq %%zmm14,%7"
  503. :
  504. : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  505. "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
  506. "m" (q[d+128]), "m" (q[d+192]));
  507. }
  508. asm volatile("sfence" : : : "memory");
  509. kernel_fpu_end();
  510. }
  511. const struct raid6_calls raid6_avx512x4 = {
  512. raid6_avx5124_gen_syndrome,
  513. raid6_avx5124_xor_syndrome,
  514. raid6_have_avx512,
  515. "avx512x4",
  516. .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
  517. };
  518. #endif