polyval.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * POLYVAL library functions
  4. *
  5. * Copyright 2025 Google LLC
  6. */
  7. #include <crypto/polyval.h>
  8. #include <linux/export.h>
  9. #include <linux/module.h>
  10. #include <linux/string.h>
  11. #include <linux/unaligned.h>
  12. /*
  13. * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
  14. * interprets the message as the coefficients of a polynomial in GF(2^128) and
  15. * evaluates that polynomial at a secret point. POLYVAL has a simple
  16. * mathematical relationship with GHASH, but it uses a better field convention
  17. * which makes it easier and faster to implement.
  18. *
  19. * POLYVAL is not a cryptographic hash function, and it should be used only by
  20. * algorithms that are specifically designed to use it.
  21. *
  22. * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
  23. * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
  24. *
  25. * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
  26. * (https://eprint.iacr.org/2021/1441.pdf).
  27. *
  28. * This file provides a library API for POLYVAL. This API can delegate to
  29. * either a generic implementation or an architecture-optimized implementation.
  30. *
  31. * For the generic implementation, we don't use the traditional table approach
  32. * to GF(2^128) multiplication. That approach is not constant-time and requires
  33. * a lot of memory. Instead, we use a different approach which emulates
  34. * carryless multiplication using standard multiplications by spreading the data
  35. * bits apart using "holes". This allows the carries to spill harmlessly. This
  36. * approach is borrowed from BoringSSL, which in turn credits BearSSL's
  37. * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
  38. * "holes" trick and a presentation by Shay Gueron
  39. * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
  40. * 256-bit => 128-bit reduction algorithm.
  41. */
  42. #ifdef CONFIG_ARCH_SUPPORTS_INT128
  43. /* Do a 64 x 64 => 128 bit carryless multiplication. */
  44. static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
  45. {
  46. /*
  47. * With 64-bit multiplicands and one term every 4 bits, there would be
  48. * up to 64 / 4 = 16 one bits per column when each multiplication is
  49. * written out as a series of additions in the schoolbook manner.
  50. * Unfortunately, that doesn't work since the value 16 is 1 too large to
  51. * fit in 4 bits. Carries would sometimes overflow into the next term.
  52. *
  53. * Using one term every 5 bits would work. However, that would cost
  54. * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
  55. *
  56. * Instead, mask off 4 bits from one multiplicand, giving a max of 15
  57. * one bits per column. Then handle those 4 bits separately.
  58. */
  59. u64 a0 = a & 0x1111111111111110;
  60. u64 a1 = a & 0x2222222222222220;
  61. u64 a2 = a & 0x4444444444444440;
  62. u64 a3 = a & 0x8888888888888880;
  63. u64 b0 = b & 0x1111111111111111;
  64. u64 b1 = b & 0x2222222222222222;
  65. u64 b2 = b & 0x4444444444444444;
  66. u64 b3 = b & 0x8888888888888888;
  67. /* Multiply the high 60 bits of @a by @b. */
  68. u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
  69. (a2 * (u128)b2) ^ (a3 * (u128)b1);
  70. u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
  71. (a2 * (u128)b3) ^ (a3 * (u128)b2);
  72. u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
  73. (a2 * (u128)b0) ^ (a3 * (u128)b3);
  74. u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
  75. (a2 * (u128)b1) ^ (a3 * (u128)b0);
  76. /* Multiply the low 4 bits of @a by @b. */
  77. u64 e0 = -(a & 1) & b;
  78. u64 e1 = -((a >> 1) & 1) & b;
  79. u64 e2 = -((a >> 2) & 1) & b;
  80. u64 e3 = -((a >> 3) & 1) & b;
  81. u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
  82. u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
  83. /* Add all the intermediate products together. */
  84. *out_lo = (((u64)c0) & 0x1111111111111111) ^
  85. (((u64)c1) & 0x2222222222222222) ^
  86. (((u64)c2) & 0x4444444444444444) ^
  87. (((u64)c3) & 0x8888888888888888) ^ extra_lo;
  88. *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
  89. (((u64)(c1 >> 64)) & 0x2222222222222222) ^
  90. (((u64)(c2 >> 64)) & 0x4444444444444444) ^
  91. (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
  92. }
  93. #else /* CONFIG_ARCH_SUPPORTS_INT128 */
  94. /* Do a 32 x 32 => 64 bit carryless multiplication. */
  95. static u64 clmul32(u32 a, u32 b)
  96. {
  97. /*
  98. * With 32-bit multiplicands and one term every 4 bits, there are up to
  99. * 32 / 4 = 8 one bits per column when each multiplication is written
  100. * out as a series of additions in the schoolbook manner. The value 8
  101. * fits in 4 bits, so the carries don't overflow into the next term.
  102. */
  103. u32 a0 = a & 0x11111111;
  104. u32 a1 = a & 0x22222222;
  105. u32 a2 = a & 0x44444444;
  106. u32 a3 = a & 0x88888888;
  107. u32 b0 = b & 0x11111111;
  108. u32 b1 = b & 0x22222222;
  109. u32 b2 = b & 0x44444444;
  110. u32 b3 = b & 0x88888888;
  111. u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
  112. (a2 * (u64)b2) ^ (a3 * (u64)b1);
  113. u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
  114. (a2 * (u64)b3) ^ (a3 * (u64)b2);
  115. u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
  116. (a2 * (u64)b0) ^ (a3 * (u64)b3);
  117. u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
  118. (a2 * (u64)b1) ^ (a3 * (u64)b0);
  119. /* Add all the intermediate products together. */
  120. return (c0 & 0x1111111111111111) ^
  121. (c1 & 0x2222222222222222) ^
  122. (c2 & 0x4444444444444444) ^
  123. (c3 & 0x8888888888888888);
  124. }
  125. /* Do a 64 x 64 => 128 bit carryless multiplication. */
  126. static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
  127. {
  128. u32 a_lo = (u32)a;
  129. u32 a_hi = a >> 32;
  130. u32 b_lo = (u32)b;
  131. u32 b_hi = b >> 32;
  132. /* Karatsuba multiplication */
  133. u64 lo = clmul32(a_lo, b_lo);
  134. u64 hi = clmul32(a_hi, b_hi);
  135. u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
  136. *out_lo = lo ^ (mi << 32);
  137. *out_hi = hi ^ (mi >> 32);
  138. }
  139. #endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
  140. /* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
  141. static void __maybe_unused
  142. polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
  143. {
  144. u64 c0, c1, c2, c3, mi0, mi1;
  145. /*
  146. * Carryless-multiply @a by @b using Karatsuba multiplication. Store
  147. * the 256-bit product in @c0 (low) through @c3 (high).
  148. */
  149. clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
  150. clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
  151. clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
  152. &mi0, &mi1);
  153. mi0 ^= c0 ^ c2;
  154. mi1 ^= c1 ^ c3;
  155. c1 ^= mi0;
  156. c2 ^= mi1;
  157. /*
  158. * Cancel out the low 128 bits of the product by adding multiples of
  159. * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
  160. * of which cancels out 64 bits. Note that we break G(x) into three
  161. * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
  162. */
  163. /*
  164. * First, add G(x) times c0 as follows:
  165. *
  166. * (c0, c1, c2) = (0,
  167. * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
  168. * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
  169. */
  170. c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
  171. c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
  172. /*
  173. * Second, add G(x) times the new c1:
  174. *
  175. * (c1, c2, c3) = (0,
  176. * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
  177. * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
  178. */
  179. c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
  180. c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
  181. /* Return (c2, c3). This implicitly multiplies by x^-128. */
  182. a->lo = cpu_to_le64(c2);
  183. a->hi = cpu_to_le64(c3);
  184. }
  185. static void __maybe_unused
  186. polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
  187. const u8 *data, size_t nblocks)
  188. {
  189. do {
  190. acc->lo ^= get_unaligned((__le64 *)data);
  191. acc->hi ^= get_unaligned((__le64 *)(data + 8));
  192. polyval_mul_generic(acc, key);
  193. data += POLYVAL_BLOCK_SIZE;
  194. } while (--nblocks);
  195. }
  196. /* Include the arch-optimized implementation of POLYVAL, if one is available. */
  197. #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
  198. #include "polyval.h" /* $(SRCARCH)/polyval.h */
  199. void polyval_preparekey(struct polyval_key *key,
  200. const u8 raw_key[POLYVAL_BLOCK_SIZE])
  201. {
  202. polyval_preparekey_arch(key, raw_key);
  203. }
  204. EXPORT_SYMBOL_GPL(polyval_preparekey);
  205. #endif /* Else, polyval_preparekey() is an inline function. */
  206. /*
  207. * polyval_mul_generic() and polyval_blocks_generic() take the key as a
  208. * polyval_elem rather than a polyval_key, so that arch-optimized
  209. * implementations with a different key format can use it as a fallback (if they
  210. * have H^1 stored somewhere in their struct). Thus, the following dispatch
  211. * code is needed to pass the appropriate key argument.
  212. */
  213. static void polyval_mul(struct polyval_ctx *ctx)
  214. {
  215. #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
  216. polyval_mul_arch(&ctx->acc, ctx->key);
  217. #else
  218. polyval_mul_generic(&ctx->acc, &ctx->key->h);
  219. #endif
  220. }
  221. static void polyval_blocks(struct polyval_ctx *ctx,
  222. const u8 *data, size_t nblocks)
  223. {
  224. #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
  225. polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
  226. #else
  227. polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
  228. #endif
  229. }
  230. void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
  231. {
  232. if (unlikely(ctx->partial)) {
  233. size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
  234. len -= n;
  235. while (n--)
  236. ctx->acc.bytes[ctx->partial++] ^= *data++;
  237. if (ctx->partial < POLYVAL_BLOCK_SIZE)
  238. return;
  239. polyval_mul(ctx);
  240. }
  241. if (len >= POLYVAL_BLOCK_SIZE) {
  242. size_t nblocks = len / POLYVAL_BLOCK_SIZE;
  243. polyval_blocks(ctx, data, nblocks);
  244. data += len & ~(POLYVAL_BLOCK_SIZE - 1);
  245. len &= POLYVAL_BLOCK_SIZE - 1;
  246. }
  247. for (size_t i = 0; i < len; i++)
  248. ctx->acc.bytes[i] ^= data[i];
  249. ctx->partial = len;
  250. }
  251. EXPORT_SYMBOL_GPL(polyval_update);
  252. void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
  253. {
  254. if (unlikely(ctx->partial))
  255. polyval_mul(ctx);
  256. memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
  257. memzero_explicit(ctx, sizeof(*ctx));
  258. }
  259. EXPORT_SYMBOL_GPL(polyval_final);
  260. #ifdef polyval_mod_init_arch
  261. static int __init polyval_mod_init(void)
  262. {
  263. polyval_mod_init_arch();
  264. return 0;
  265. }
  266. subsys_initcall(polyval_mod_init);
  267. static void __exit polyval_mod_exit(void)
  268. {
  269. }
  270. module_exit(polyval_mod_exit);
  271. #endif
  272. MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
  273. MODULE_LICENSE("GPL");