zstd_fast.c 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984
  1. // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
  2. /*
  3. * Copyright (c) Meta Platforms, Inc. and affiliates.
  4. * All rights reserved.
  5. *
  6. * This source code is licensed under both the BSD-style license (found in the
  7. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  8. * in the COPYING file in the root directory of this source tree).
  9. * You may select, at your option, one of the above-listed licenses.
  10. */
  11. #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
  12. #include "zstd_fast.h"
  13. static
  14. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  15. void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
  16. const void* const end,
  17. ZSTD_dictTableLoadMethod_e dtlm)
  18. {
  19. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  20. U32* const hashTable = ms->hashTable;
  21. U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
  22. U32 const mls = cParams->minMatch;
  23. const BYTE* const base = ms->window.base;
  24. const BYTE* ip = base + ms->nextToUpdate;
  25. const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
  26. const U32 fastHashFillStep = 3;
  27. /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
  28. * Feel free to remove this assert if there's a good reason! */
  29. assert(dtlm == ZSTD_dtlm_full);
  30. /* Always insert every fastHashFillStep position into the hash table.
  31. * Insert the other positions if their hash entry is empty.
  32. */
  33. for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
  34. U32 const curr = (U32)(ip - base);
  35. { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
  36. ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); }
  37. if (dtlm == ZSTD_dtlm_fast) continue;
  38. /* Only load extra positions for ZSTD_dtlm_full */
  39. { U32 p;
  40. for (p = 1; p < fastHashFillStep; ++p) {
  41. size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
  42. if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */
  43. ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
  44. } } } }
  45. }
  46. static
  47. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  48. void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
  49. const void* const end,
  50. ZSTD_dictTableLoadMethod_e dtlm)
  51. {
  52. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  53. U32* const hashTable = ms->hashTable;
  54. U32 const hBits = cParams->hashLog;
  55. U32 const mls = cParams->minMatch;
  56. const BYTE* const base = ms->window.base;
  57. const BYTE* ip = base + ms->nextToUpdate;
  58. const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
  59. const U32 fastHashFillStep = 3;
  60. /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
  61. * Feel free to remove this assert if there's a good reason! */
  62. assert(dtlm == ZSTD_dtlm_fast);
  63. /* Always insert every fastHashFillStep position into the hash table.
  64. * Insert the other positions if their hash entry is empty.
  65. */
  66. for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
  67. U32 const curr = (U32)(ip - base);
  68. size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
  69. hashTable[hash0] = curr;
  70. if (dtlm == ZSTD_dtlm_fast) continue;
  71. /* Only load extra positions for ZSTD_dtlm_full */
  72. { U32 p;
  73. for (p = 1; p < fastHashFillStep; ++p) {
  74. size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
  75. if (hashTable[hash] == 0) { /* not yet filled */
  76. hashTable[hash] = curr + p;
  77. } } } }
  78. }
  79. void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
  80. const void* const end,
  81. ZSTD_dictTableLoadMethod_e dtlm,
  82. ZSTD_tableFillPurpose_e tfp)
  83. {
  84. if (tfp == ZSTD_tfp_forCDict) {
  85. ZSTD_fillHashTableForCDict(ms, end, dtlm);
  86. } else {
  87. ZSTD_fillHashTableForCCtx(ms, end, dtlm);
  88. }
  89. }
  90. typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);
  91. static int
  92. ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
  93. {
  94. /* Array of ~random data, should have low probability of matching data.
  95. * Load from here if the index is invalid.
  96. * Used to avoid unpredictable branches. */
  97. static const BYTE dummy[] = {0x12,0x34,0x56,0x78};
  98. /* currentIdx >= lowLimit is a (somewhat) unpredictable branch.
  99. * However expression below compiles into conditional move.
  100. */
  101. const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
  102. /* Note: this used to be written as : return test1 && test2;
  103. * Unfortunately, once inlined, these tests become branches,
  104. * in which case it becomes critical that they are executed in the right order (test1 then test2).
  105. * So we have to write these tests in a specific manner to ensure their ordering.
  106. */
  107. if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
  108. /* force ordering of these tests, which matters once the function is inlined, as they become branches */
  109. __asm__("");
  110. return matchIdx >= idxLowLimit;
  111. }
  112. static int
  113. ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
  114. {
  115. /* using a branch instead of a cmov,
  116. * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
  117. * aka almost all candidates are within range */
  118. U32 mval;
  119. if (matchIdx >= idxLowLimit) {
  120. mval = MEM_read32(matchAddress);
  121. } else {
  122. mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
  123. }
  124. return (MEM_read32(currentPtr) == mval);
  125. }
  126. /*
  127. * If you squint hard enough (and ignore repcodes), the search operation at any
  128. * given position is broken into 4 stages:
  129. *
  130. * 1. Hash (map position to hash value via input read)
  131. * 2. Lookup (map hash val to index via hashtable read)
  132. * 3. Load (map index to value at that position via input read)
  133. * 4. Compare
  134. *
  135. * Each of these steps involves a memory read at an address which is computed
  136. * from the previous step. This means these steps must be sequenced and their
  137. * latencies are cumulative.
  138. *
  139. * Rather than do 1->2->3->4 sequentially for a single position before moving
  140. * onto the next, this implementation interleaves these operations across the
  141. * next few positions:
  142. *
  143. * R = Repcode Read & Compare
  144. * H = Hash
  145. * T = Table Lookup
  146. * M = Match Read & Compare
  147. *
  148. * Pos | Time -->
  149. * ----+-------------------
  150. * N | ... M
  151. * N+1 | ... TM
  152. * N+2 | R H T M
  153. * N+3 | H TM
  154. * N+4 | R H T M
  155. * N+5 | H ...
  156. * N+6 | R ...
  157. *
  158. * This is very much analogous to the pipelining of execution in a CPU. And just
  159. * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
  160. * branch).
  161. *
  162. * When this happens, we throw away our current state, and do the following prep
  163. * to re-enter the loop:
  164. *
  165. * Pos | Time -->
  166. * ----+-------------------
  167. * N | H T
  168. * N+1 | H
  169. *
  170. * This is also the work we do at the beginning to enter the loop initially.
  171. */
  172. FORCE_INLINE_TEMPLATE
  173. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  174. size_t ZSTD_compressBlock_fast_noDict_generic(
  175. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  176. void const* src, size_t srcSize,
  177. U32 const mls, int useCmov)
  178. {
  179. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  180. U32* const hashTable = ms->hashTable;
  181. U32 const hlog = cParams->hashLog;
  182. size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
  183. const BYTE* const base = ms->window.base;
  184. const BYTE* const istart = (const BYTE*)src;
  185. const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
  186. const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
  187. const BYTE* const prefixStart = base + prefixStartIndex;
  188. const BYTE* const iend = istart + srcSize;
  189. const BYTE* const ilimit = iend - HASH_READ_SIZE;
  190. const BYTE* anchor = istart;
  191. const BYTE* ip0 = istart;
  192. const BYTE* ip1;
  193. const BYTE* ip2;
  194. const BYTE* ip3;
  195. U32 current0;
  196. U32 rep_offset1 = rep[0];
  197. U32 rep_offset2 = rep[1];
  198. U32 offsetSaved1 = 0, offsetSaved2 = 0;
  199. size_t hash0; /* hash for ip0 */
  200. size_t hash1; /* hash for ip1 */
  201. U32 matchIdx; /* match idx for ip0 */
  202. U32 offcode;
  203. const BYTE* match0;
  204. size_t mLength;
  205. /* ip0 and ip1 are always adjacent. The targetLength skipping and
  206. * uncompressibility acceleration is applied to every other position,
  207. * matching the behavior of #1562. step therefore represents the gap
  208. * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
  209. size_t step;
  210. const BYTE* nextStep;
  211. const size_t kStepIncr = (1 << (kSearchStrength - 1));
  212. const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;
  213. DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
  214. ip0 += (ip0 == prefixStart);
  215. { U32 const curr = (U32)(ip0 - base);
  216. U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
  217. U32 const maxRep = curr - windowLow;
  218. if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
  219. if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
  220. }
  221. /* start each op */
  222. _start: /* Requires: ip0 */
  223. step = stepSize;
  224. nextStep = ip0 + kStepIncr;
  225. /* calculate positions, ip0 - anchor == 0, so we skip step calc */
  226. ip1 = ip0 + 1;
  227. ip2 = ip0 + step;
  228. ip3 = ip2 + 1;
  229. if (ip3 >= ilimit) {
  230. goto _cleanup;
  231. }
  232. hash0 = ZSTD_hashPtr(ip0, hlog, mls);
  233. hash1 = ZSTD_hashPtr(ip1, hlog, mls);
  234. matchIdx = hashTable[hash0];
  235. do {
  236. /* load repcode match for ip[2]*/
  237. const U32 rval = MEM_read32(ip2 - rep_offset1);
  238. /* write back hash table entry */
  239. current0 = (U32)(ip0 - base);
  240. hashTable[hash0] = current0;
  241. /* check repcode at ip[2] */
  242. if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
  243. ip0 = ip2;
  244. match0 = ip0 - rep_offset1;
  245. mLength = ip0[-1] == match0[-1];
  246. ip0 -= mLength;
  247. match0 -= mLength;
  248. offcode = REPCODE1_TO_OFFBASE;
  249. mLength += 4;
  250. /* Write next hash table entry: it's already calculated.
  251. * This write is known to be safe because ip1 is before the
  252. * repcode (ip2). */
  253. hashTable[hash1] = (U32)(ip1 - base);
  254. goto _match;
  255. }
  256. if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
  257. /* Write next hash table entry (it's already calculated).
  258. * This write is known to be safe because the ip1 == ip0 + 1,
  259. * so searching will resume after ip1 */
  260. hashTable[hash1] = (U32)(ip1 - base);
  261. goto _offset;
  262. }
  263. /* lookup ip[1] */
  264. matchIdx = hashTable[hash1];
  265. /* hash ip[2] */
  266. hash0 = hash1;
  267. hash1 = ZSTD_hashPtr(ip2, hlog, mls);
  268. /* advance to next positions */
  269. ip0 = ip1;
  270. ip1 = ip2;
  271. ip2 = ip3;
  272. /* write back hash table entry */
  273. current0 = (U32)(ip0 - base);
  274. hashTable[hash0] = current0;
  275. if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
  276. /* Write next hash table entry, since it's already calculated */
  277. if (step <= 4) {
  278. /* Avoid writing an index if it's >= position where search will resume.
  279. * The minimum possible match has length 4, so search can resume at ip0 + 4.
  280. */
  281. hashTable[hash1] = (U32)(ip1 - base);
  282. }
  283. goto _offset;
  284. }
  285. /* lookup ip[1] */
  286. matchIdx = hashTable[hash1];
  287. /* hash ip[2] */
  288. hash0 = hash1;
  289. hash1 = ZSTD_hashPtr(ip2, hlog, mls);
  290. /* advance to next positions */
  291. ip0 = ip1;
  292. ip1 = ip2;
  293. ip2 = ip0 + step;
  294. ip3 = ip1 + step;
  295. /* calculate step */
  296. if (ip2 >= nextStep) {
  297. step++;
  298. PREFETCH_L1(ip1 + 64);
  299. PREFETCH_L1(ip1 + 128);
  300. nextStep += kStepIncr;
  301. }
  302. } while (ip3 < ilimit);
  303. _cleanup:
  304. /* Note that there are probably still a couple positions one could search.
  305. * However, it seems to be a meaningful performance hit to try to search
  306. * them. So let's not. */
  307. /* When the repcodes are outside of the prefix, we set them to zero before the loop.
  308. * When the offsets are still zero, we need to restore them after the block to have a correct
  309. * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
  310. * offsets were invalid. We need to figure out which offset to refill with.
  311. * - If both offsets are zero they are in the same order.
  312. * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
  313. * - If only one is zero, we need to decide which offset to restore.
  314. * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
  315. * - It is impossible for rep_offset2 to be non-zero.
  316. *
  317. * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
  318. * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
  319. */
  320. offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
  321. /* save reps for next block */
  322. rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
  323. rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
  324. /* Return the last literals size */
  325. return (size_t)(iend - anchor);
  326. _offset: /* Requires: ip0, idx */
  327. /* Compute the offset code. */
  328. match0 = base + matchIdx;
  329. rep_offset2 = rep_offset1;
  330. rep_offset1 = (U32)(ip0-match0);
  331. offcode = OFFSET_TO_OFFBASE(rep_offset1);
  332. mLength = 4;
  333. /* Count the backwards match length. */
  334. while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
  335. ip0--;
  336. match0--;
  337. mLength++;
  338. }
  339. _match: /* Requires: ip0, match0, offcode */
  340. /* Count the forward length. */
  341. mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
  342. ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
  343. ip0 += mLength;
  344. anchor = ip0;
  345. /* Fill table and check for immediate repcode. */
  346. if (ip0 <= ilimit) {
  347. /* Fill Table */
  348. assert(base+current0+2 > istart); /* check base overflow */
  349. hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
  350. hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
  351. if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
  352. while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
  353. /* store sequence */
  354. size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
  355. { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
  356. hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
  357. ip0 += rLength;
  358. ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
  359. anchor = ip0;
  360. continue; /* faster when present (confirmed on gcc-8) ... (?) */
  361. } } }
  362. goto _start;
  363. }
  364. #define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \
  365. static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \
  366. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
  367. void const* src, size_t srcSize) \
  368. { \
  369. return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
  370. }
  371. ZSTD_GEN_FAST_FN(noDict, 4, 1)
  372. ZSTD_GEN_FAST_FN(noDict, 5, 1)
  373. ZSTD_GEN_FAST_FN(noDict, 6, 1)
  374. ZSTD_GEN_FAST_FN(noDict, 7, 1)
  375. ZSTD_GEN_FAST_FN(noDict, 4, 0)
  376. ZSTD_GEN_FAST_FN(noDict, 5, 0)
  377. ZSTD_GEN_FAST_FN(noDict, 6, 0)
  378. ZSTD_GEN_FAST_FN(noDict, 7, 0)
  379. size_t ZSTD_compressBlock_fast(
  380. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  381. void const* src, size_t srcSize)
  382. {
  383. U32 const mml = ms->cParams.minMatch;
  384. /* use cmov when "candidate in range" branch is likely unpredictable */
  385. int const useCmov = ms->cParams.windowLog < 19;
  386. assert(ms->dictMatchState == NULL);
  387. if (useCmov) {
  388. switch(mml)
  389. {
  390. default: /* includes case 3 */
  391. case 4 :
  392. return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
  393. case 5 :
  394. return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
  395. case 6 :
  396. return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
  397. case 7 :
  398. return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
  399. }
  400. } else {
  401. /* use a branch instead */
  402. switch(mml)
  403. {
  404. default: /* includes case 3 */
  405. case 4 :
  406. return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
  407. case 5 :
  408. return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
  409. case 6 :
  410. return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
  411. case 7 :
  412. return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
  413. }
  414. }
  415. }
  416. FORCE_INLINE_TEMPLATE
  417. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  418. size_t ZSTD_compressBlock_fast_dictMatchState_generic(
  419. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  420. void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
  421. {
  422. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  423. U32* const hashTable = ms->hashTable;
  424. U32 const hlog = cParams->hashLog;
  425. /* support stepSize of 0 */
  426. U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
  427. const BYTE* const base = ms->window.base;
  428. const BYTE* const istart = (const BYTE*)src;
  429. const BYTE* ip0 = istart;
  430. const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
  431. const BYTE* anchor = istart;
  432. const U32 prefixStartIndex = ms->window.dictLimit;
  433. const BYTE* const prefixStart = base + prefixStartIndex;
  434. const BYTE* const iend = istart + srcSize;
  435. const BYTE* const ilimit = iend - HASH_READ_SIZE;
  436. U32 offset_1=rep[0], offset_2=rep[1];
  437. const ZSTD_MatchState_t* const dms = ms->dictMatchState;
  438. const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
  439. const U32* const dictHashTable = dms->hashTable;
  440. const U32 dictStartIndex = dms->window.dictLimit;
  441. const BYTE* const dictBase = dms->window.base;
  442. const BYTE* const dictStart = dictBase + dictStartIndex;
  443. const BYTE* const dictEnd = dms->window.nextSrc;
  444. const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
  445. const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
  446. const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
  447. /* if a dictionary is still attached, it necessarily means that
  448. * it is within window size. So we just check it. */
  449. const U32 maxDistance = 1U << cParams->windowLog;
  450. const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
  451. assert(endIndex - prefixStartIndex <= maxDistance);
  452. (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
  453. (void)hasStep; /* not currently specialized on whether it's accelerated */
  454. /* ensure there will be no underflow
  455. * when translating a dict index into a local index */
  456. assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
  457. if (ms->prefetchCDictTables) {
  458. size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
  459. PREFETCH_AREA(dictHashTable, hashTableBytes);
  460. }
  461. /* init */
  462. DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
  463. ip0 += (dictAndPrefixLength == 0);
  464. /* dictMatchState repCode checks don't currently handle repCode == 0
  465. * disabling. */
  466. assert(offset_1 <= dictAndPrefixLength);
  467. assert(offset_2 <= dictAndPrefixLength);
  468. /* Outer search loop */
  469. assert(stepSize >= 1);
  470. while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
  471. size_t mLength;
  472. size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
  473. size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
  474. U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
  475. int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
  476. U32 matchIndex = hashTable[hash0];
  477. U32 curr = (U32)(ip0 - base);
  478. size_t step = stepSize;
  479. const size_t kStepIncr = 1 << kSearchStrength;
  480. const BYTE* nextStep = ip0 + kStepIncr;
  481. /* Inner search loop */
  482. while (1) {
  483. const BYTE* match = base + matchIndex;
  484. const U32 repIndex = curr + 1 - offset_1;
  485. const BYTE* repMatch = (repIndex < prefixStartIndex) ?
  486. dictBase + (repIndex - dictIndexDelta) :
  487. base + repIndex;
  488. const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
  489. size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
  490. hashTable[hash0] = curr; /* update hash table */
  491. if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
  492. && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
  493. const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
  494. mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
  495. ip0++;
  496. ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
  497. break;
  498. }
  499. if (dictTagsMatch) {
  500. /* Found a possible dict match */
  501. const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
  502. const BYTE* dictMatch = dictBase + dictMatchIndex;
  503. if (dictMatchIndex > dictStartIndex &&
  504. MEM_read32(dictMatch) == MEM_read32(ip0)) {
  505. /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
  506. if (matchIndex <= prefixStartIndex) {
  507. U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
  508. mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
  509. while (((ip0 > anchor) & (dictMatch > dictStart))
  510. && (ip0[-1] == dictMatch[-1])) {
  511. ip0--;
  512. dictMatch--;
  513. mLength++;
  514. } /* catch up */
  515. offset_2 = offset_1;
  516. offset_1 = offset;
  517. ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
  518. break;
  519. }
  520. }
  521. }
  522. if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
  523. /* found a regular match of size >= 4 */
  524. U32 const offset = (U32) (ip0 - match);
  525. mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
  526. while (((ip0 > anchor) & (match > prefixStart))
  527. && (ip0[-1] == match[-1])) {
  528. ip0--;
  529. match--;
  530. mLength++;
  531. } /* catch up */
  532. offset_2 = offset_1;
  533. offset_1 = offset;
  534. ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
  535. break;
  536. }
  537. /* Prepare for next iteration */
  538. dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
  539. dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
  540. matchIndex = hashTable[hash1];
  541. if (ip1 >= nextStep) {
  542. step++;
  543. nextStep += kStepIncr;
  544. }
  545. ip0 = ip1;
  546. ip1 = ip1 + step;
  547. if (ip1 > ilimit) goto _cleanup;
  548. curr = (U32)(ip0 - base);
  549. hash0 = hash1;
  550. } /* end inner search loop */
  551. /* match found */
  552. assert(mLength);
  553. ip0 += mLength;
  554. anchor = ip0;
  555. if (ip0 <= ilimit) {
  556. /* Fill Table */
  557. assert(base+curr+2 > istart); /* check base overflow */
  558. hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */
  559. hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
  560. /* check immediate repcode */
  561. while (ip0 <= ilimit) {
  562. U32 const current2 = (U32)(ip0-base);
  563. U32 const repIndex2 = current2 - offset_2;
  564. const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
  565. dictBase - dictIndexDelta + repIndex2 :
  566. base + repIndex2;
  567. if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
  568. && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
  569. const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
  570. size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
  571. U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
  572. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
  573. hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
  574. ip0 += repLength2;
  575. anchor = ip0;
  576. continue;
  577. }
  578. break;
  579. }
  580. }
  581. /* Prepare for next iteration */
  582. assert(ip0 == anchor);
  583. ip1 = ip0 + stepSize;
  584. }
  585. _cleanup:
  586. /* save reps for next block */
  587. rep[0] = offset_1;
  588. rep[1] = offset_2;
  589. /* Return the last literals size */
  590. return (size_t)(iend - anchor);
  591. }
  592. ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
  593. ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
  594. ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
  595. ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
  596. size_t ZSTD_compressBlock_fast_dictMatchState(
  597. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  598. void const* src, size_t srcSize)
  599. {
  600. U32 const mls = ms->cParams.minMatch;
  601. assert(ms->dictMatchState != NULL);
  602. switch(mls)
  603. {
  604. default: /* includes case 3 */
  605. case 4 :
  606. return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
  607. case 5 :
  608. return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
  609. case 6 :
  610. return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
  611. case 7 :
  612. return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
  613. }
  614. }
  615. static
  616. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  617. size_t ZSTD_compressBlock_fast_extDict_generic(
  618. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  619. void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
  620. {
  621. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  622. U32* const hashTable = ms->hashTable;
  623. U32 const hlog = cParams->hashLog;
  624. /* support stepSize of 0 */
  625. size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
  626. const BYTE* const base = ms->window.base;
  627. const BYTE* const dictBase = ms->window.dictBase;
  628. const BYTE* const istart = (const BYTE*)src;
  629. const BYTE* anchor = istart;
  630. const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
  631. const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
  632. const U32 dictStartIndex = lowLimit;
  633. const BYTE* const dictStart = dictBase + dictStartIndex;
  634. const U32 dictLimit = ms->window.dictLimit;
  635. const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
  636. const BYTE* const prefixStart = base + prefixStartIndex;
  637. const BYTE* const dictEnd = dictBase + prefixStartIndex;
  638. const BYTE* const iend = istart + srcSize;
  639. const BYTE* const ilimit = iend - 8;
  640. U32 offset_1=rep[0], offset_2=rep[1];
  641. U32 offsetSaved1 = 0, offsetSaved2 = 0;
  642. const BYTE* ip0 = istart;
  643. const BYTE* ip1;
  644. const BYTE* ip2;
  645. const BYTE* ip3;
  646. U32 current0;
  647. size_t hash0; /* hash for ip0 */
  648. size_t hash1; /* hash for ip1 */
  649. U32 idx; /* match idx for ip0 */
  650. const BYTE* idxBase; /* base pointer for idx */
  651. U32 offcode;
  652. const BYTE* match0;
  653. size_t mLength;
  654. const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
  655. size_t step;
  656. const BYTE* nextStep;
  657. const size_t kStepIncr = (1 << (kSearchStrength - 1));
  658. (void)hasStep; /* not currently specialized on whether it's accelerated */
  659. DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
  660. /* switch to "regular" variant if extDict is invalidated due to maxDistance */
  661. if (prefixStartIndex == dictStartIndex)
  662. return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
  663. { U32 const curr = (U32)(ip0 - base);
  664. U32 const maxRep = curr - dictStartIndex;
  665. if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
  666. if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
  667. }
  668. /* start each op */
  669. _start: /* Requires: ip0 */
  670. step = stepSize;
  671. nextStep = ip0 + kStepIncr;
  672. /* calculate positions, ip0 - anchor == 0, so we skip step calc */
  673. ip1 = ip0 + 1;
  674. ip2 = ip0 + step;
  675. ip3 = ip2 + 1;
  676. if (ip3 >= ilimit) {
  677. goto _cleanup;
  678. }
  679. hash0 = ZSTD_hashPtr(ip0, hlog, mls);
  680. hash1 = ZSTD_hashPtr(ip1, hlog, mls);
  681. idx = hashTable[hash0];
  682. idxBase = idx < prefixStartIndex ? dictBase : base;
  683. do {
  684. { /* load repcode match for ip[2] */
  685. U32 const current2 = (U32)(ip2 - base);
  686. U32 const repIndex = current2 - offset_1;
  687. const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
  688. U32 rval;
  689. if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
  690. & (offset_1 > 0) ) {
  691. rval = MEM_read32(repBase + repIndex);
  692. } else {
  693. rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
  694. }
  695. /* write back hash table entry */
  696. current0 = (U32)(ip0 - base);
  697. hashTable[hash0] = current0;
  698. /* check repcode at ip[2] */
  699. if (MEM_read32(ip2) == rval) {
  700. ip0 = ip2;
  701. match0 = repBase + repIndex;
  702. matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
  703. assert((match0 != prefixStart) & (match0 != dictStart));
  704. mLength = ip0[-1] == match0[-1];
  705. ip0 -= mLength;
  706. match0 -= mLength;
  707. offcode = REPCODE1_TO_OFFBASE;
  708. mLength += 4;
  709. goto _match;
  710. } }
  711. { /* load match for ip[0] */
  712. U32 const mval = idx >= dictStartIndex ?
  713. MEM_read32(idxBase + idx) :
  714. MEM_read32(ip0) ^ 1; /* guaranteed not to match */
  715. /* check match at ip[0] */
  716. if (MEM_read32(ip0) == mval) {
  717. /* found a match! */
  718. goto _offset;
  719. } }
  720. /* lookup ip[1] */
  721. idx = hashTable[hash1];
  722. idxBase = idx < prefixStartIndex ? dictBase : base;
  723. /* hash ip[2] */
  724. hash0 = hash1;
  725. hash1 = ZSTD_hashPtr(ip2, hlog, mls);
  726. /* advance to next positions */
  727. ip0 = ip1;
  728. ip1 = ip2;
  729. ip2 = ip3;
  730. /* write back hash table entry */
  731. current0 = (U32)(ip0 - base);
  732. hashTable[hash0] = current0;
  733. { /* load match for ip[0] */
  734. U32 const mval = idx >= dictStartIndex ?
  735. MEM_read32(idxBase + idx) :
  736. MEM_read32(ip0) ^ 1; /* guaranteed not to match */
  737. /* check match at ip[0] */
  738. if (MEM_read32(ip0) == mval) {
  739. /* found a match! */
  740. goto _offset;
  741. } }
  742. /* lookup ip[1] */
  743. idx = hashTable[hash1];
  744. idxBase = idx < prefixStartIndex ? dictBase : base;
  745. /* hash ip[2] */
  746. hash0 = hash1;
  747. hash1 = ZSTD_hashPtr(ip2, hlog, mls);
  748. /* advance to next positions */
  749. ip0 = ip1;
  750. ip1 = ip2;
  751. ip2 = ip0 + step;
  752. ip3 = ip1 + step;
  753. /* calculate step */
  754. if (ip2 >= nextStep) {
  755. step++;
  756. PREFETCH_L1(ip1 + 64);
  757. PREFETCH_L1(ip1 + 128);
  758. nextStep += kStepIncr;
  759. }
  760. } while (ip3 < ilimit);
  761. _cleanup:
  762. /* Note that there are probably still a couple positions we could search.
  763. * However, it seems to be a meaningful performance hit to try to search
  764. * them. So let's not. */
  765. /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
  766. * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
  767. offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
  768. /* save reps for next block */
  769. rep[0] = offset_1 ? offset_1 : offsetSaved1;
  770. rep[1] = offset_2 ? offset_2 : offsetSaved2;
  771. /* Return the last literals size */
  772. return (size_t)(iend - anchor);
  773. _offset: /* Requires: ip0, idx, idxBase */
  774. /* Compute the offset code. */
  775. { U32 const offset = current0 - idx;
  776. const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
  777. matchEnd = idx < prefixStartIndex ? dictEnd : iend;
  778. match0 = idxBase + idx;
  779. offset_2 = offset_1;
  780. offset_1 = offset;
  781. offcode = OFFSET_TO_OFFBASE(offset);
  782. mLength = 4;
  783. /* Count the backwards match length. */
  784. while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
  785. ip0--;
  786. match0--;
  787. mLength++;
  788. } }
  789. _match: /* Requires: ip0, match0, offcode, matchEnd */
  790. /* Count the forward length. */
  791. assert(matchEnd != 0);
  792. mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
  793. ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
  794. ip0 += mLength;
  795. anchor = ip0;
  796. /* write next hash table entry */
  797. if (ip1 < ip0) {
  798. hashTable[hash1] = (U32)(ip1 - base);
  799. }
  800. /* Fill table and check for immediate repcode. */
  801. if (ip0 <= ilimit) {
  802. /* Fill Table */
  803. assert(base+current0+2 > istart); /* check base overflow */
  804. hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
  805. hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
  806. while (ip0 <= ilimit) {
  807. U32 const repIndex2 = (U32)(ip0-base) - offset_2;
  808. const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
  809. if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
  810. && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
  811. const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
  812. size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
  813. { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
  814. ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
  815. hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
  816. ip0 += repLength2;
  817. anchor = ip0;
  818. continue;
  819. }
  820. break;
  821. } }
  822. goto _start;
  823. }
  824. ZSTD_GEN_FAST_FN(extDict, 4, 0)
  825. ZSTD_GEN_FAST_FN(extDict, 5, 0)
  826. ZSTD_GEN_FAST_FN(extDict, 6, 0)
  827. ZSTD_GEN_FAST_FN(extDict, 7, 0)
  828. size_t ZSTD_compressBlock_fast_extDict(
  829. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  830. void const* src, size_t srcSize)
  831. {
  832. U32 const mls = ms->cParams.minMatch;
  833. assert(ms->dictMatchState == NULL);
  834. switch(mls)
  835. {
  836. default: /* includes case 3 */
  837. case 4 :
  838. return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
  839. case 5 :
  840. return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
  841. case 6 :
  842. return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
  843. case 7 :
  844. return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
  845. }
  846. }