crc-clmul-template.h 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /* Copyright 2025 Google LLC */
  3. /*
  4. * This file is a "template" that generates a CRC function optimized using the
  5. * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
  6. * file must define the following parameters to specify the type of CRC:
  7. *
  8. * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
  9. * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
  10. * mapping between bits and polynomial coefficients
  11. * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
  12. * mapping between bits and polynomial coefficients
  13. */
  14. #include <asm/byteorder.h>
  15. #include <linux/minmax.h>
  16. #define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
  17. static inline unsigned long clmul(unsigned long a, unsigned long b)
  18. {
  19. unsigned long res;
  20. asm(".option push\n"
  21. ".option arch,+zbc\n"
  22. "clmul %0, %1, %2\n"
  23. ".option pop\n"
  24. : "=r" (res) : "r" (a), "r" (b));
  25. return res;
  26. }
  27. static inline unsigned long clmulh(unsigned long a, unsigned long b)
  28. {
  29. unsigned long res;
  30. asm(".option push\n"
  31. ".option arch,+zbc\n"
  32. "clmulh %0, %1, %2\n"
  33. ".option pop\n"
  34. : "=r" (res) : "r" (a), "r" (b));
  35. return res;
  36. }
  37. static inline unsigned long clmulr(unsigned long a, unsigned long b)
  38. {
  39. unsigned long res;
  40. asm(".option push\n"
  41. ".option arch,+zbc\n"
  42. "clmulr %0, %1, %2\n"
  43. ".option pop\n"
  44. : "=r" (res) : "r" (a), "r" (b));
  45. return res;
  46. }
  47. /*
  48. * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
  49. * polynomial whose bit order matches the CRC's bit order.
  50. */
  51. #ifdef CONFIG_64BIT
  52. # if LSB_CRC
  53. # define crc_load_long(x) le64_to_cpup(x)
  54. # else
  55. # define crc_load_long(x) be64_to_cpup(x)
  56. # endif
  57. #else
  58. # if LSB_CRC
  59. # define crc_load_long(x) le32_to_cpup(x)
  60. # else
  61. # define crc_load_long(x) be32_to_cpup(x)
  62. # endif
  63. #endif
  64. /* XOR @crc into the end of @msgpoly that represents the high-order terms. */
  65. static inline unsigned long
  66. crc_clmul_prep(crc_t crc, unsigned long msgpoly)
  67. {
  68. #if LSB_CRC
  69. return msgpoly ^ crc;
  70. #else
  71. return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
  72. #endif
  73. }
  74. /*
  75. * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
  76. * modulo the generator polynomial G. This gives the CRC of @msgpoly.
  77. */
  78. static inline crc_t
  79. crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
  80. {
  81. unsigned long tmp;
  82. /*
  83. * First step of Barrett reduction with integrated multiplication by
  84. * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
  85. * which G needs to be multiplied to cancel out the x^n and higher terms
  86. * of msgpoly * x^n. Do it using the following formula:
  87. *
  88. * msb-first:
  89. * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
  90. * lsb-first:
  91. * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
  92. *
  93. * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
  94. * which fits a long exactly. Using any lower power of x there would
  95. * not carry enough precision through the calculation, while using any
  96. * higher power of x would require extra instructions to handle a wider
  97. * multiplication. In the msb-first case, using this power of x results
  98. * in needing a floored division by x^(BITS_PER_LONG-1), which matches
  99. * what clmulr produces. In the lsb-first case, a factor of x gets
  100. * implicitly introduced by each carryless multiplication (shown as
  101. * '* x' above), and the floored division instead needs to be by
  102. * x^BITS_PER_LONG which matches what clmul produces.
  103. */
  104. #if LSB_CRC
  105. tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
  106. #else
  107. tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
  108. #endif
  109. /*
  110. * Second step of Barrett reduction:
  111. *
  112. * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
  113. *
  114. * This reduces (msgpoly * x^n) modulo G by adding the appropriate
  115. * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
  116. * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
  117. * terms in the first place, it is more efficient to do the equivalent:
  118. *
  119. * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
  120. *
  121. * In the lsb-first case further modify it to the following which avoids
  122. * a shift, as the crc ends up in the physically low n bits from clmulr:
  123. *
  124. * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
  125. * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
  126. *
  127. * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
  128. * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
  129. * cast of the result to crc_t is essential, as it applies the mod x^n!
  130. */
  131. #if LSB_CRC
  132. return clmulr(tmp, consts->barrett_reduction_const_2);
  133. #else
  134. return clmul(tmp, consts->barrett_reduction_const_2);
  135. #endif
  136. }
  137. /* Update @crc with the data from @msgpoly. */
  138. static inline crc_t
  139. crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
  140. const struct crc_clmul_consts *consts)
  141. {
  142. return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
  143. }
  144. /* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
  145. static inline crc_t
  146. crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
  147. const struct crc_clmul_consts *consts)
  148. {
  149. unsigned long msgpoly;
  150. size_t i;
  151. #if LSB_CRC
  152. msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
  153. for (i = 1; i < len; i++)
  154. msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
  155. #else
  156. msgpoly = p[0];
  157. for (i = 1; i < len; i++)
  158. msgpoly = (msgpoly << 8) ^ p[i];
  159. #endif
  160. if (len >= sizeof(crc_t)) {
  161. #if LSB_CRC
  162. msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
  163. #else
  164. msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
  165. #endif
  166. return crc_clmul_long(msgpoly, consts);
  167. }
  168. #if LSB_CRC
  169. msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
  170. return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
  171. #else
  172. msgpoly ^= crc >> (CRC_BITS - 8*len);
  173. return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
  174. #endif
  175. }
  176. static inline crc_t
  177. crc_clmul(crc_t crc, const void *p, size_t len,
  178. const struct crc_clmul_consts *consts)
  179. {
  180. size_t align;
  181. /* This implementation assumes that the CRC fits in an unsigned long. */
  182. BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
  183. /* If the buffer is not long-aligned, align it. */
  184. align = (unsigned long)p % sizeof(unsigned long);
  185. if (align && len) {
  186. align = min(sizeof(unsigned long) - align, len);
  187. crc = crc_clmul_update_partial(crc, p, align, consts);
  188. p += align;
  189. len -= align;
  190. }
  191. if (len >= 4 * sizeof(unsigned long)) {
  192. unsigned long m0, m1;
  193. m0 = crc_clmul_prep(crc, crc_load_long(p));
  194. m1 = crc_load_long(p + sizeof(unsigned long));
  195. p += 2 * sizeof(unsigned long);
  196. len -= 2 * sizeof(unsigned long);
  197. /*
  198. * Main loop. Each iteration starts with a message polynomial
  199. * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
  200. * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
  201. * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
  202. * "folds" that back into a congruent (modulo G) value that uses
  203. * just m0 and m1 again. This is done by multiplying m0 by the
  204. * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
  205. * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
  206. * adding the results to m2 and m3 as appropriate. Each such
  207. * multiplication produces a result twice the length of a long,
  208. * which in RISC-V is two instructions clmul and clmulh.
  209. *
  210. * This could be changed to fold across more than 2 longs at a
  211. * time if there is a CPU that can take advantage of it.
  212. */
  213. do {
  214. unsigned long p0, p1, p2, p3;
  215. p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
  216. p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
  217. p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
  218. p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
  219. m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
  220. m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
  221. crc_load_long(p + sizeof(unsigned long));
  222. p += 2 * sizeof(unsigned long);
  223. len -= 2 * sizeof(unsigned long);
  224. } while (len >= 2 * sizeof(unsigned long));
  225. crc = crc_clmul_long(m0, consts);
  226. crc = crc_clmul_update_long(crc, m1, consts);
  227. }
  228. while (len >= sizeof(unsigned long)) {
  229. crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
  230. p += sizeof(unsigned long);
  231. len -= sizeof(unsigned long);
  232. }
  233. if (len)
  234. crc = crc_clmul_update_partial(crc, p, len, consts);
  235. return crc;
  236. }