zstd_lazy.c 100 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200
  1. // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
  2. /*
  3. * Copyright (c) Meta Platforms, Inc. and affiliates.
  4. * All rights reserved.
  5. *
  6. * This source code is licensed under both the BSD-style license (found in the
  7. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  8. * in the COPYING file in the root directory of this source tree).
  9. * You may select, at your option, one of the above-listed licenses.
  10. */
  11. #include "zstd_compress_internal.h"
  12. #include "zstd_lazy.h"
  13. #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
  14. #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
  15. || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
  16. || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
  17. || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  18. #define kLazySkippingStep 8
  19. /*-*************************************
  20. * Binary Tree search
  21. ***************************************/
  22. static
  23. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  24. void ZSTD_updateDUBT(ZSTD_MatchState_t* ms,
  25. const BYTE* ip, const BYTE* iend,
  26. U32 mls)
  27. {
  28. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  29. U32* const hashTable = ms->hashTable;
  30. U32 const hashLog = cParams->hashLog;
  31. U32* const bt = ms->chainTable;
  32. U32 const btLog = cParams->chainLog - 1;
  33. U32 const btMask = (1 << btLog) - 1;
  34. const BYTE* const base = ms->window.base;
  35. U32 const target = (U32)(ip - base);
  36. U32 idx = ms->nextToUpdate;
  37. if (idx != target)
  38. DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
  39. idx, target, ms->window.dictLimit);
  40. assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
  41. (void)iend;
  42. assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
  43. for ( ; idx < target ; idx++) {
  44. size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
  45. U32 const matchIndex = hashTable[h];
  46. U32* const nextCandidatePtr = bt + 2*(idx&btMask);
  47. U32* const sortMarkPtr = nextCandidatePtr + 1;
  48. DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
  49. hashTable[h] = idx; /* Update Hash Table */
  50. *nextCandidatePtr = matchIndex; /* update BT like a chain */
  51. *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
  52. }
  53. ms->nextToUpdate = target;
  54. }
  55. /* ZSTD_insertDUBT1() :
  56. * sort one already inserted but unsorted position
  57. * assumption : curr >= btlow == (curr - btmask)
  58. * doesn't fail */
  59. static
  60. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  61. void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms,
  62. U32 curr, const BYTE* inputEnd,
  63. U32 nbCompares, U32 btLow,
  64. const ZSTD_dictMode_e dictMode)
  65. {
  66. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  67. U32* const bt = ms->chainTable;
  68. U32 const btLog = cParams->chainLog - 1;
  69. U32 const btMask = (1 << btLog) - 1;
  70. size_t commonLengthSmaller=0, commonLengthLarger=0;
  71. const BYTE* const base = ms->window.base;
  72. const BYTE* const dictBase = ms->window.dictBase;
  73. const U32 dictLimit = ms->window.dictLimit;
  74. const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
  75. const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
  76. const BYTE* const dictEnd = dictBase + dictLimit;
  77. const BYTE* const prefixStart = base + dictLimit;
  78. const BYTE* match;
  79. U32* smallerPtr = bt + 2*(curr&btMask);
  80. U32* largerPtr = smallerPtr + 1;
  81. U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
  82. U32 dummy32; /* to be nullified at the end */
  83. U32 const windowValid = ms->window.lowLimit;
  84. U32 const maxDistance = 1U << cParams->windowLog;
  85. U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
  86. DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
  87. curr, dictLimit, windowLow);
  88. assert(curr >= btLow);
  89. assert(ip < iend); /* condition for ZSTD_count */
  90. for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  91. U32* const nextPtr = bt + 2*(matchIndex & btMask);
  92. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  93. assert(matchIndex < curr);
  94. /* note : all candidates are now supposed sorted,
  95. * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
  96. * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
  97. if ( (dictMode != ZSTD_extDict)
  98. || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
  99. || (curr < dictLimit) /* both in extDict */) {
  100. const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
  101. || (matchIndex+matchLength >= dictLimit)) ?
  102. base : dictBase;
  103. assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
  104. || (curr < dictLimit) );
  105. match = mBase + matchIndex;
  106. matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  107. } else {
  108. match = dictBase + matchIndex;
  109. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  110. if (matchIndex+matchLength >= dictLimit)
  111. match = base + matchIndex; /* preparation for next read of match[matchLength] */
  112. }
  113. DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
  114. curr, matchIndex, (U32)matchLength);
  115. if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  116. break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
  117. }
  118. if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
  119. /* match is smaller than current */
  120. *smallerPtr = matchIndex; /* update smaller idx */
  121. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  122. if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
  123. DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
  124. matchIndex, btLow, nextPtr[1]);
  125. smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
  126. matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
  127. } else {
  128. /* match is larger than current */
  129. *largerPtr = matchIndex;
  130. commonLengthLarger = matchLength;
  131. if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
  132. DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
  133. matchIndex, btLow, nextPtr[0]);
  134. largerPtr = nextPtr;
  135. matchIndex = nextPtr[0];
  136. } }
  137. *smallerPtr = *largerPtr = 0;
  138. }
  139. static
  140. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  141. size_t ZSTD_DUBT_findBetterDictMatch (
  142. const ZSTD_MatchState_t* ms,
  143. const BYTE* const ip, const BYTE* const iend,
  144. size_t* offsetPtr,
  145. size_t bestLength,
  146. U32 nbCompares,
  147. U32 const mls,
  148. const ZSTD_dictMode_e dictMode)
  149. {
  150. const ZSTD_MatchState_t * const dms = ms->dictMatchState;
  151. const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
  152. const U32 * const dictHashTable = dms->hashTable;
  153. U32 const hashLog = dmsCParams->hashLog;
  154. size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
  155. U32 dictMatchIndex = dictHashTable[h];
  156. const BYTE* const base = ms->window.base;
  157. const BYTE* const prefixStart = base + ms->window.dictLimit;
  158. U32 const curr = (U32)(ip-base);
  159. const BYTE* const dictBase = dms->window.base;
  160. const BYTE* const dictEnd = dms->window.nextSrc;
  161. U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
  162. U32 const dictLowLimit = dms->window.lowLimit;
  163. U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
  164. U32* const dictBt = dms->chainTable;
  165. U32 const btLog = dmsCParams->chainLog - 1;
  166. U32 const btMask = (1 << btLog) - 1;
  167. U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
  168. size_t commonLengthSmaller=0, commonLengthLarger=0;
  169. (void)dictMode;
  170. assert(dictMode == ZSTD_dictMatchState);
  171. for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
  172. U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
  173. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  174. const BYTE* match = dictBase + dictMatchIndex;
  175. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  176. if (dictMatchIndex+matchLength >= dictHighLimit)
  177. match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
  178. if (matchLength > bestLength) {
  179. U32 matchIndex = dictMatchIndex + dictIndexDelta;
  180. if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  181. DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
  182. curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
  183. bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  184. }
  185. if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  186. break; /* drop, to guarantee consistency (miss a little bit of compression) */
  187. }
  188. }
  189. if (match[matchLength] < ip[matchLength]) {
  190. if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
  191. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  192. dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
  193. } else {
  194. /* match is larger than current */
  195. if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
  196. commonLengthLarger = matchLength;
  197. dictMatchIndex = nextPtr[0];
  198. }
  199. }
  200. if (bestLength >= MINMATCH) {
  201. U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
  202. DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  203. curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  204. }
  205. return bestLength;
  206. }
  207. static
  208. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  209. size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms,
  210. const BYTE* const ip, const BYTE* const iend,
  211. size_t* offBasePtr,
  212. U32 const mls,
  213. const ZSTD_dictMode_e dictMode)
  214. {
  215. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  216. U32* const hashTable = ms->hashTable;
  217. U32 const hashLog = cParams->hashLog;
  218. size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
  219. U32 matchIndex = hashTable[h];
  220. const BYTE* const base = ms->window.base;
  221. U32 const curr = (U32)(ip-base);
  222. U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
  223. U32* const bt = ms->chainTable;
  224. U32 const btLog = cParams->chainLog - 1;
  225. U32 const btMask = (1 << btLog) - 1;
  226. U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
  227. U32 const unsortLimit = MAX(btLow, windowLow);
  228. U32* nextCandidate = bt + 2*(matchIndex&btMask);
  229. U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;
  230. U32 nbCompares = 1U << cParams->searchLog;
  231. U32 nbCandidates = nbCompares;
  232. U32 previousCandidate = 0;
  233. DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
  234. assert(ip <= iend-8); /* required for h calculation */
  235. assert(dictMode != ZSTD_dedicatedDictSearch);
  236. /* reach end of unsorted candidates list */
  237. while ( (matchIndex > unsortLimit)
  238. && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
  239. && (nbCandidates > 1) ) {
  240. DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
  241. matchIndex);
  242. *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */
  243. previousCandidate = matchIndex;
  244. matchIndex = *nextCandidate;
  245. nextCandidate = bt + 2*(matchIndex&btMask);
  246. unsortedMark = bt + 2*(matchIndex&btMask) + 1;
  247. nbCandidates --;
  248. }
  249. /* nullify last candidate if it's still unsorted
  250. * simplification, detrimental to compression ratio, beneficial for speed */
  251. if ( (matchIndex > unsortLimit)
  252. && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
  253. DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
  254. matchIndex);
  255. *nextCandidate = *unsortedMark = 0;
  256. }
  257. /* batch sort stacked candidates */
  258. matchIndex = previousCandidate;
  259. while (matchIndex) { /* will end on matchIndex == 0 */
  260. U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
  261. U32 const nextCandidateIdx = *nextCandidateIdxPtr;
  262. ZSTD_insertDUBT1(ms, matchIndex, iend,
  263. nbCandidates, unsortLimit, dictMode);
  264. matchIndex = nextCandidateIdx;
  265. nbCandidates++;
  266. }
  267. /* find longest match */
  268. { size_t commonLengthSmaller = 0, commonLengthLarger = 0;
  269. const BYTE* const dictBase = ms->window.dictBase;
  270. const U32 dictLimit = ms->window.dictLimit;
  271. const BYTE* const dictEnd = dictBase + dictLimit;
  272. const BYTE* const prefixStart = base + dictLimit;
  273. U32* smallerPtr = bt + 2*(curr&btMask);
  274. U32* largerPtr = bt + 2*(curr&btMask) + 1;
  275. U32 matchEndIdx = curr + 8 + 1;
  276. U32 dummy32; /* to be nullified at the end */
  277. size_t bestLength = 0;
  278. matchIndex = hashTable[h];
  279. hashTable[h] = curr; /* Update Hash Table */
  280. for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  281. U32* const nextPtr = bt + 2*(matchIndex & btMask);
  282. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  283. const BYTE* match;
  284. if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
  285. match = base + matchIndex;
  286. matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  287. } else {
  288. match = dictBase + matchIndex;
  289. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  290. if (matchIndex+matchLength >= dictLimit)
  291. match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
  292. }
  293. if (matchLength > bestLength) {
  294. if (matchLength > matchEndIdx - matchIndex)
  295. matchEndIdx = matchIndex + (U32)matchLength;
  296. if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
  297. bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  298. if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  299. if (dictMode == ZSTD_dictMatchState) {
  300. nbCompares = 0; /* in addition to avoiding checking any
  301. * further in this loop, make sure we
  302. * skip checking in the dictionary. */
  303. }
  304. break; /* drop, to guarantee consistency (miss a little bit of compression) */
  305. }
  306. }
  307. if (match[matchLength] < ip[matchLength]) {
  308. /* match is smaller than current */
  309. *smallerPtr = matchIndex; /* update smaller idx */
  310. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  311. if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
  312. smallerPtr = nextPtr+1; /* new "smaller" => larger of match */
  313. matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
  314. } else {
  315. /* match is larger than current */
  316. *largerPtr = matchIndex;
  317. commonLengthLarger = matchLength;
  318. if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
  319. largerPtr = nextPtr;
  320. matchIndex = nextPtr[0];
  321. } }
  322. *smallerPtr = *largerPtr = 0;
  323. assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  324. if (dictMode == ZSTD_dictMatchState && nbCompares) {
  325. bestLength = ZSTD_DUBT_findBetterDictMatch(
  326. ms, ip, iend,
  327. offBasePtr, bestLength, nbCompares,
  328. mls, dictMode);
  329. }
  330. assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  331. ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  332. if (bestLength >= MINMATCH) {
  333. U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
  334. DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  335. curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
  336. }
  337. return bestLength;
  338. }
  339. }
  340. /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */
  341. FORCE_INLINE_TEMPLATE
  342. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  343. size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms,
  344. const BYTE* const ip, const BYTE* const iLimit,
  345. size_t* offBasePtr,
  346. const U32 mls /* template */,
  347. const ZSTD_dictMode_e dictMode)
  348. {
  349. DEBUGLOG(7, "ZSTD_BtFindBestMatch");
  350. if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
  351. ZSTD_updateDUBT(ms, ip, iLimit, mls);
  352. return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
  353. }
  354. /* *********************************
  355. * Dedicated dict search
  356. ***********************************/
  357. void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip)
  358. {
  359. const BYTE* const base = ms->window.base;
  360. U32 const target = (U32)(ip - base);
  361. U32* const hashTable = ms->hashTable;
  362. U32* const chainTable = ms->chainTable;
  363. U32 const chainSize = 1 << ms->cParams.chainLog;
  364. U32 idx = ms->nextToUpdate;
  365. U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
  366. U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
  367. U32 const cacheSize = bucketSize - 1;
  368. U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
  369. U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
  370. /* We know the hashtable is oversized by a factor of `bucketSize`.
  371. * We are going to temporarily pretend `bucketSize == 1`, keeping only a
  372. * single entry. We will use the rest of the space to construct a temporary
  373. * chaintable.
  374. */
  375. U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
  376. U32* const tmpHashTable = hashTable;
  377. U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
  378. U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
  379. U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
  380. U32 hashIdx;
  381. assert(ms->cParams.chainLog <= 24);
  382. assert(ms->cParams.hashLog > ms->cParams.chainLog);
  383. assert(idx != 0);
  384. assert(tmpMinChain <= minChain);
  385. /* fill conventional hash table and conventional chain table */
  386. for ( ; idx < target; idx++) {
  387. U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
  388. if (idx >= tmpMinChain) {
  389. tmpChainTable[idx - tmpMinChain] = hashTable[h];
  390. }
  391. tmpHashTable[h] = idx;
  392. }
  393. /* sort chains into ddss chain table */
  394. {
  395. U32 chainPos = 0;
  396. for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
  397. U32 count;
  398. U32 countBeyondMinChain = 0;
  399. U32 i = tmpHashTable[hashIdx];
  400. for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
  401. /* skip through the chain to the first position that won't be
  402. * in the hash cache bucket */
  403. if (i < minChain) {
  404. countBeyondMinChain++;
  405. }
  406. i = tmpChainTable[i - tmpMinChain];
  407. }
  408. if (count == cacheSize) {
  409. for (count = 0; count < chainLimit;) {
  410. if (i < minChain) {
  411. if (!i || ++countBeyondMinChain > cacheSize) {
  412. /* only allow pulling `cacheSize` number of entries
  413. * into the cache or chainTable beyond `minChain`,
  414. * to replace the entries pulled out of the
  415. * chainTable into the cache. This lets us reach
  416. * back further without increasing the total number
  417. * of entries in the chainTable, guaranteeing the
  418. * DDSS chain table will fit into the space
  419. * allocated for the regular one. */
  420. break;
  421. }
  422. }
  423. chainTable[chainPos++] = i;
  424. count++;
  425. if (i < tmpMinChain) {
  426. break;
  427. }
  428. i = tmpChainTable[i - tmpMinChain];
  429. }
  430. } else {
  431. count = 0;
  432. }
  433. if (count) {
  434. tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
  435. } else {
  436. tmpHashTable[hashIdx] = 0;
  437. }
  438. }
  439. assert(chainPos <= chainSize); /* I believe this is guaranteed... */
  440. }
  441. /* move chain pointers into the last entry of each hash bucket */
  442. for (hashIdx = (1 << hashLog); hashIdx; ) {
  443. U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
  444. U32 const chainPackedPointer = tmpHashTable[hashIdx];
  445. U32 i;
  446. for (i = 0; i < cacheSize; i++) {
  447. hashTable[bucketIdx + i] = 0;
  448. }
  449. hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
  450. }
  451. /* fill the buckets of the hash table */
  452. for (idx = ms->nextToUpdate; idx < target; idx++) {
  453. U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
  454. << ZSTD_LAZY_DDSS_BUCKET_LOG;
  455. U32 i;
  456. /* Shift hash cache down 1. */
  457. for (i = cacheSize - 1; i; i--)
  458. hashTable[h + i] = hashTable[h + i - 1];
  459. hashTable[h] = idx;
  460. }
  461. ms->nextToUpdate = target;
  462. }
  463. /* Returns the longest match length found in the dedicated dict search structure.
  464. * If none are longer than the argument ml, then ml will be returned.
  465. */
  466. FORCE_INLINE_TEMPLATE
  467. size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
  468. const ZSTD_MatchState_t* const dms,
  469. const BYTE* const ip, const BYTE* const iLimit,
  470. const BYTE* const prefixStart, const U32 curr,
  471. const U32 dictLimit, const size_t ddsIdx) {
  472. const U32 ddsLowestIndex = dms->window.dictLimit;
  473. const BYTE* const ddsBase = dms->window.base;
  474. const BYTE* const ddsEnd = dms->window.nextSrc;
  475. const U32 ddsSize = (U32)(ddsEnd - ddsBase);
  476. const U32 ddsIndexDelta = dictLimit - ddsSize;
  477. const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
  478. const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
  479. U32 ddsAttempt;
  480. U32 matchIndex;
  481. for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
  482. PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
  483. }
  484. {
  485. U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
  486. U32 const chainIndex = chainPackedPointer >> 8;
  487. PREFETCH_L1(&dms->chainTable[chainIndex]);
  488. }
  489. for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
  490. size_t currentMl=0;
  491. const BYTE* match;
  492. matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
  493. match = ddsBase + matchIndex;
  494. if (!matchIndex) {
  495. return ml;
  496. }
  497. /* guaranteed by table construction */
  498. (void)ddsLowestIndex;
  499. assert(matchIndex >= ddsLowestIndex);
  500. assert(match+4 <= ddsEnd);
  501. if (MEM_read32(match) == MEM_read32(ip)) {
  502. /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  503. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
  504. }
  505. /* save best solution */
  506. if (currentMl > ml) {
  507. ml = currentMl;
  508. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  509. if (ip+currentMl == iLimit) {
  510. /* best possible, avoids read overflow on next attempt */
  511. return ml;
  512. }
  513. }
  514. }
  515. {
  516. U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
  517. U32 chainIndex = chainPackedPointer >> 8;
  518. U32 const chainLength = chainPackedPointer & 0xFF;
  519. U32 const chainAttempts = nbAttempts - ddsAttempt;
  520. U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
  521. U32 chainAttempt;
  522. for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
  523. PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
  524. }
  525. for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
  526. size_t currentMl=0;
  527. const BYTE* match;
  528. matchIndex = dms->chainTable[chainIndex];
  529. match = ddsBase + matchIndex;
  530. /* guaranteed by table construction */
  531. assert(matchIndex >= ddsLowestIndex);
  532. assert(match+4 <= ddsEnd);
  533. if (MEM_read32(match) == MEM_read32(ip)) {
  534. /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  535. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
  536. }
  537. /* save best solution */
  538. if (currentMl > ml) {
  539. ml = currentMl;
  540. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  541. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  542. }
  543. }
  544. }
  545. return ml;
  546. }
  547. /* *********************************
  548. * Hash Chain
  549. ***********************************/
  550. #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
  551. /* Update chains up to ip (excluded)
  552. Assumption : always within prefix (i.e. not within extDict) */
  553. FORCE_INLINE_TEMPLATE
  554. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  555. U32 ZSTD_insertAndFindFirstIndex_internal(
  556. ZSTD_MatchState_t* ms,
  557. const ZSTD_compressionParameters* const cParams,
  558. const BYTE* ip, U32 const mls, U32 const lazySkipping)
  559. {
  560. U32* const hashTable = ms->hashTable;
  561. const U32 hashLog = cParams->hashLog;
  562. U32* const chainTable = ms->chainTable;
  563. const U32 chainMask = (1 << cParams->chainLog) - 1;
  564. const BYTE* const base = ms->window.base;
  565. const U32 target = (U32)(ip - base);
  566. U32 idx = ms->nextToUpdate;
  567. while(idx < target) { /* catch up */
  568. size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
  569. NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
  570. hashTable[h] = idx;
  571. idx++;
  572. /* Stop inserting every position when in the lazy skipping mode. */
  573. if (lazySkipping)
  574. break;
  575. }
  576. ms->nextToUpdate = target;
  577. return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
  578. }
  579. U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) {
  580. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  581. return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
  582. }
  583. /* inlining is important to hardwire a hot branch (template emulation) */
  584. FORCE_INLINE_TEMPLATE
  585. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  586. size_t ZSTD_HcFindBestMatch(
  587. ZSTD_MatchState_t* ms,
  588. const BYTE* const ip, const BYTE* const iLimit,
  589. size_t* offsetPtr,
  590. const U32 mls, const ZSTD_dictMode_e dictMode)
  591. {
  592. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  593. U32* const chainTable = ms->chainTable;
  594. const U32 chainSize = (1 << cParams->chainLog);
  595. const U32 chainMask = chainSize-1;
  596. const BYTE* const base = ms->window.base;
  597. const BYTE* const dictBase = ms->window.dictBase;
  598. const U32 dictLimit = ms->window.dictLimit;
  599. const BYTE* const prefixStart = base + dictLimit;
  600. const BYTE* const dictEnd = dictBase + dictLimit;
  601. const U32 curr = (U32)(ip-base);
  602. const U32 maxDistance = 1U << cParams->windowLog;
  603. const U32 lowestValid = ms->window.lowLimit;
  604. const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
  605. const U32 isDictionary = (ms->loadedDictEnd != 0);
  606. const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
  607. const U32 minChain = curr > chainSize ? curr - chainSize : 0;
  608. U32 nbAttempts = 1U << cParams->searchLog;
  609. size_t ml=4-1;
  610. const ZSTD_MatchState_t* const dms = ms->dictMatchState;
  611. const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
  612. ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
  613. const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
  614. ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
  615. U32 matchIndex;
  616. if (dictMode == ZSTD_dedicatedDictSearch) {
  617. const U32* entry = &dms->hashTable[ddsIdx];
  618. PREFETCH_L1(entry);
  619. }
  620. /* HC4 match finder */
  621. matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
  622. for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  623. size_t currentMl=0;
  624. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  625. const BYTE* const match = base + matchIndex;
  626. assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
  627. /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
  628. if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  629. currentMl = ZSTD_count(ip, match, iLimit);
  630. } else {
  631. const BYTE* const match = dictBase + matchIndex;
  632. assert(match+4 <= dictEnd);
  633. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  634. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
  635. }
  636. /* save best solution */
  637. if (currentMl > ml) {
  638. ml = currentMl;
  639. *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  640. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  641. }
  642. if (matchIndex <= minChain) break;
  643. matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
  644. }
  645. assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  646. if (dictMode == ZSTD_dedicatedDictSearch) {
  647. ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
  648. ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
  649. } else if (dictMode == ZSTD_dictMatchState) {
  650. const U32* const dmsChainTable = dms->chainTable;
  651. const U32 dmsChainSize = (1 << dms->cParams.chainLog);
  652. const U32 dmsChainMask = dmsChainSize - 1;
  653. const U32 dmsLowestIndex = dms->window.dictLimit;
  654. const BYTE* const dmsBase = dms->window.base;
  655. const BYTE* const dmsEnd = dms->window.nextSrc;
  656. const U32 dmsSize = (U32)(dmsEnd - dmsBase);
  657. const U32 dmsIndexDelta = dictLimit - dmsSize;
  658. const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
  659. matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
  660. for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
  661. size_t currentMl=0;
  662. const BYTE* const match = dmsBase + matchIndex;
  663. assert(match+4 <= dmsEnd);
  664. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  665. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
  666. /* save best solution */
  667. if (currentMl > ml) {
  668. ml = currentMl;
  669. assert(curr > matchIndex + dmsIndexDelta);
  670. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  671. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  672. }
  673. if (matchIndex <= dmsMinChain) break;
  674. matchIndex = dmsChainTable[matchIndex & dmsChainMask];
  675. }
  676. }
  677. return ml;
  678. }
  679. /* *********************************
  680. * (SIMD) Row-based matchfinder
  681. ***********************************/
  682. /* Constants for row-based hash */
  683. #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
  684. #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
  685. #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
  686. typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
  687. /* ZSTD_VecMask_next():
  688. * Starting from the LSB, returns the idx of the next non-zero bit.
  689. * Basically counting the nb of trailing zeroes.
  690. */
  691. MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
  692. return ZSTD_countTrailingZeros64(val);
  693. }
  694. /* ZSTD_row_nextIndex():
  695. * Returns the next index to insert at within a tagTable row, and updates the "head"
  696. * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  697. */
  698. FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
  699. U32 next = (*tagRow-1) & rowMask;
  700. next += (next == 0) ? rowMask : 0; /* skip first position */
  701. *tagRow = (BYTE)next;
  702. return next;
  703. }
  704. /* ZSTD_isAligned():
  705. * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
  706. */
  707. MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
  708. assert((align & (align - 1)) == 0);
  709. return (((size_t)ptr) & (align - 1)) == 0;
  710. }
  711. /* ZSTD_row_prefetch():
  712. * Performs prefetching for the hashTable and tagTable at a given row.
  713. */
  714. FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
  715. PREFETCH_L1(hashTable + relRow);
  716. if (rowLog >= 5) {
  717. PREFETCH_L1(hashTable + relRow + 16);
  718. /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
  719. }
  720. PREFETCH_L1(tagTable + relRow);
  721. if (rowLog == 6) {
  722. PREFETCH_L1(tagTable + relRow + 32);
  723. }
  724. assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
  725. assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
  726. assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
  727. }
  728. /* ZSTD_row_fillHashCache():
  729. * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
  730. * but not beyond iLimit.
  731. */
  732. FORCE_INLINE_TEMPLATE
  733. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  734. void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base,
  735. U32 const rowLog, U32 const mls,
  736. U32 idx, const BYTE* const iLimit)
  737. {
  738. U32 const* const hashTable = ms->hashTable;
  739. BYTE const* const tagTable = ms->tagTable;
  740. U32 const hashLog = ms->rowHashLog;
  741. U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
  742. U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
  743. for (; idx < lim; ++idx) {
  744. U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  745. U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  746. ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  747. ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
  748. }
  749. DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
  750. ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
  751. ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
  752. }
  753. /* ZSTD_row_nextCachedHash():
  754. * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
  755. * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  756. */
  757. FORCE_INLINE_TEMPLATE
  758. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  759. U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
  760. BYTE const* tagTable, BYTE const* base,
  761. U32 idx, U32 const hashLog,
  762. U32 const rowLog, U32 const mls,
  763. U64 const hashSalt)
  764. {
  765. U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
  766. U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  767. ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  768. { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
  769. cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
  770. return hash;
  771. }
  772. }
  773. /* ZSTD_row_update_internalImpl():
  774. * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
  775. */
  776. FORCE_INLINE_TEMPLATE
  777. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  778. void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms,
  779. U32 updateStartIdx, U32 const updateEndIdx,
  780. U32 const mls, U32 const rowLog,
  781. U32 const rowMask, U32 const useCache)
  782. {
  783. U32* const hashTable = ms->hashTable;
  784. BYTE* const tagTable = ms->tagTable;
  785. U32 const hashLog = ms->rowHashLog;
  786. const BYTE* const base = ms->window.base;
  787. DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
  788. for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
  789. U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
  790. : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  791. U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  792. U32* const row = hashTable + relRow;
  793. BYTE* tagRow = tagTable + relRow;
  794. U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
  795. assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
  796. tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
  797. row[pos] = updateStartIdx;
  798. }
  799. }
  800. /* ZSTD_row_update_internal():
  801. * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
  802. * Skips sections of long matches as is necessary.
  803. */
  804. FORCE_INLINE_TEMPLATE
  805. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  806. void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip,
  807. U32 const mls, U32 const rowLog,
  808. U32 const rowMask, U32 const useCache)
  809. {
  810. U32 idx = ms->nextToUpdate;
  811. const BYTE* const base = ms->window.base;
  812. const U32 target = (U32)(ip - base);
  813. const U32 kSkipThreshold = 384;
  814. const U32 kMaxMatchStartPositionsToUpdate = 96;
  815. const U32 kMaxMatchEndPositionsToUpdate = 32;
  816. if (useCache) {
  817. /* Only skip positions when using hash cache, i.e.
  818. * if we are loading a dict, don't skip anything.
  819. * If we decide to skip, then we only update a set number
  820. * of positions at the beginning and end of the match.
  821. */
  822. if (UNLIKELY(target - idx > kSkipThreshold)) {
  823. U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
  824. ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
  825. idx = target - kMaxMatchEndPositionsToUpdate;
  826. ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
  827. }
  828. }
  829. assert(target >= idx);
  830. ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
  831. ms->nextToUpdate = target;
  832. }
  833. /* ZSTD_row_update():
  834. * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
  835. * processing.
  836. */
  837. void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) {
  838. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  839. const U32 rowMask = (1u << rowLog) - 1;
  840. const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
  841. DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
  842. ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
  843. }
  844. /* Returns the mask width of bits group of which will be set to 1. Given not all
  845. * architectures have easy movemask instruction, this helps to iterate over
  846. * groups of bits easier and faster.
  847. */
  848. FORCE_INLINE_TEMPLATE U32
  849. ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
  850. {
  851. assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
  852. assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
  853. (void)rowEntries;
  854. #if defined(ZSTD_ARCH_ARM_NEON)
  855. /* NEON path only works for little endian */
  856. if (!MEM_isLittleEndian()) {
  857. return 1;
  858. }
  859. if (rowEntries == 16) {
  860. return 4;
  861. }
  862. if (rowEntries == 32) {
  863. return 2;
  864. }
  865. if (rowEntries == 64) {
  866. return 1;
  867. }
  868. #endif
  869. return 1;
  870. }
  #if defined(ZSTD_ARCH_X86_SSE2)
  /* ZSTD_row_getSSEMask():
   * Compares `nbChunks` (1, 2 or 4) consecutive 16-byte chunks of `src` against `tag`,
   * gathers one bit per byte (bit set == byte equals tag, chunk 0 in the lowest 16 bits),
   * then rotates the resulting 16-, 32- or 64-bit mask right by `head`.
   */
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
  ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
  {
      const __m128i tagSplat = _mm_set1_epi8((char)tag);
      int bits[4] = {0};
      int c;
      assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
      for (c = 0; c < nbChunks; c++) {
          const __m128i lane = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*c));
          bits[c] = _mm_movemask_epi8(_mm_cmpeq_epi8(lane, tagSplat));
      }
      switch (nbChunks) {
      case 1:
          return ZSTD_rotateRight_U16((U16)bits[0], head);
      case 2:
          return ZSTD_rotateRight_U32((U32)bits[1] << 16 | (U32)bits[0], head);
      default:
          break;
      }
      assert(nbChunks == 4);
      return ZSTD_rotateRight_U64((U64)bits[3] << 48 | (U64)bits[2] << 32 | (U64)bits[1] << 16 | (U64)bits[0], head);
  }
  #endif
  #if defined(ZSTD_ARCH_ARM_NEON)
  /* ZSTD_row_getNEONMask():
   * NEON flavour of the tag comparison : compares the `rowEntries` (16, 32 or 64) bytes
   * at `src` against `tag` and condenses the outcome into a 64-bit mask, rotated right
   * by `headGrouped`. 16-entry rows are masked with 0x88.. and 32-entry rows with 0x55..
   * so that a single bit per group remains; 64-entry rows are returned unmasked.
   */
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
  ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
  {
      assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);

      if (rowEntries == 16) {
          /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
           * After that groups of 4 bits represent the equalMask. We lower
           * all bits except the highest in these groups by doing AND with
           * 0x88 = 0b10001000.
           */
          const uint8x16_t tags   = vld1q_u8(src);
          const uint16x8_t equal  = vreinterpretq_u16_u8(vceqq_u8(tags, vdupq_n_u8(tag)));
          const uint8x8_t  packed = vshrn_n_u16(equal, 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(packed), 0);
          return ZSTD_rotateRight_U64(bits, headGrouped) & 0x8888888888888888ull;
      }

      if (rowEntries == 32) {
          /* Same idea as with rowEntries == 16 but doing AND with
           * 0x55 = 0b01010101.
           */
          const uint16x8x2_t tags  = vld2q_u16((const uint16_t*)(const void*)src);
          const uint8x16_t tagsLo  = vreinterpretq_u8_u16(tags.val[0]);
          const uint8x16_t tagsHi  = vreinterpretq_u8_u16(tags.val[1]);
          const uint8x16_t splat   = vdupq_n_u8(tag);
          const uint8x8_t equalLo  = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(tagsLo, splat)), 6);
          const uint8x8_t equalHi  = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(tagsHi, splat)), 6);
          const uint8x8_t packed   = vsli_n_u8(equalLo, equalHi, 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(packed), 0) ;
          return ZSTD_rotateRight_U64(bits, headGrouped) & 0x5555555555555555ull;
      }

      {   /* rowEntries == 64 */
          const uint8x16x4_t tags = vld4q_u8(src);
          const uint8x16_t splat  = vdupq_n_u8(tag);
          const uint8x16_t eq0    = vceqq_u8(tags.val[0], splat);
          const uint8x16_t eq1    = vceqq_u8(tags.val[1], splat);
          const uint8x16_t eq2    = vceqq_u8(tags.val[2], splat);
          const uint8x16_t eq3    = vceqq_u8(tags.val[3], splat);

          /* fold the four comparison results together with shift-right-insert steps */
          const uint8x16_t pair01 = vsriq_n_u8(eq1, eq0, 1);
          const uint8x16_t pair23 = vsriq_n_u8(eq3, eq2, 1);
          const uint8x16_t quad   = vsriq_n_u8(pair23, pair01, 2);
          const uint8x16_t folded = vsriq_n_u8(quad, quad, 4);
          const uint8x8_t  packed = vshrn_n_u16(vreinterpretq_u16_u8(folded), 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(packed), 0);
          return ZSTD_rotateRight_U64(bits, headGrouped);
      }
  }
  #endif
  936. /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
  937. * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
  938. * matches the hash at the nth position in a row of the tagTable.
  939. * Each row is a circular buffer beginning at the value of "headGrouped". So we
  940. * must rotate the "matches" bitfield to match up with the actual layout of the
  941. * entries within the hashTable */
  942. FORCE_INLINE_TEMPLATE ZSTD_VecMask
  943. ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
  944. {
  945. const BYTE* const src = tagRow;
  946. assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
  947. assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
  948. assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
  949. #if defined(ZSTD_ARCH_X86_SSE2)
  950. return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
  951. #else /* SW or NEON-LE */
  952. # if defined(ZSTD_ARCH_ARM_NEON)
  953. /* This NEON path only works for little endian - otherwise use SWAR below */
  954. if (MEM_isLittleEndian()) {
  955. return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
  956. }
  957. # endif /* ZSTD_ARCH_ARM_NEON */
  958. /* SWAR */
  959. { const int chunkSize = sizeof(size_t);
  960. const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
  961. const size_t xFF = ~((size_t)0);
  962. const size_t x01 = xFF / 0xFF;
  963. const size_t x80 = x01 << 7;
  964. const size_t splatChar = tag * x01;
  965. ZSTD_VecMask matches = 0;
  966. int i = rowEntries - chunkSize;
  967. assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
  968. if (MEM_isLittleEndian()) { /* runtime check so have two loops */
  969. const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
  970. do {
  971. size_t chunk = MEM_readST(&src[i]);
  972. chunk ^= splatChar;
  973. chunk = (((chunk | x80) - x01) | chunk) & x80;
  974. matches <<= chunkSize;
  975. matches |= (chunk * extractMagic) >> shiftAmount;
  976. i -= chunkSize;
  977. } while (i >= 0);
  978. } else { /* big endian: reverse bits during extraction */
  979. const size_t msb = xFF ^ (xFF >> 1);
  980. const size_t extractMagic = (msb / 0x1FF) | msb;
  981. do {
  982. size_t chunk = MEM_readST(&src[i]);
  983. chunk ^= splatChar;
  984. chunk = (((chunk | x80) - x01) | chunk) & x80;
  985. matches <<= chunkSize;
  986. matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
  987. i -= chunkSize;
  988. } while (i >= 0);
  989. }
  990. matches = ~matches;
  991. if (rowEntries == 16) {
  992. return ZSTD_rotateRight_U16((U16)matches, headGrouped);
  993. } else if (rowEntries == 32) {
  994. return ZSTD_rotateRight_U32((U32)matches, headGrouped);
  995. } else {
  996. return ZSTD_rotateRight_U64((U64)matches, headGrouped);
  997. }
  998. }
  999. #endif
  1000. }
  1001. /* The high-level approach of the SIMD row based match finder is as follows:
  1002. * - Figure out where to insert the new entry:
  1003. * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index.
  1004. * - The hash is salted by a value that changes on every context reset, so when the same table is used
  1005. * we will avoid collisions that would otherwise slow us down by introducing phantom matches.
  * - The hashTable is effectively split into groups or "rows" of 15, 31 or 63 usable entries of U32 (a row
  *   holds 16, 32 or 64 slots, the first of which never receives a position), and the index determines
  *   which row to insert into.
  * - Determine the correct position within the row to insert the entry into. Each row can be considered
  *   as a circular buffer with a "head" index that resides in the tagTable (overall 16, 32 or 64 bytes
  *   per row).
  1011. * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
  1012. * generate a bitfield that we can cycle through to check the collisions in the hash table.
  1013. * - Pick the longest match.
  1014. * - Insert the tag into the equivalent row and position in the tagTable.
  1015. */
  1016. FORCE_INLINE_TEMPLATE
  1017. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1018. size_t ZSTD_RowFindBestMatch(
  1019. ZSTD_MatchState_t* ms,
  1020. const BYTE* const ip, const BYTE* const iLimit,
  1021. size_t* offsetPtr,
  1022. const U32 mls, const ZSTD_dictMode_e dictMode,
  1023. const U32 rowLog)
  1024. {
  1025. U32* const hashTable = ms->hashTable;
  1026. BYTE* const tagTable = ms->tagTable;
  1027. U32* const hashCache = ms->hashCache;
  1028. const U32 hashLog = ms->rowHashLog;
  1029. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  1030. const BYTE* const base = ms->window.base;
  1031. const BYTE* const dictBase = ms->window.dictBase;
  1032. const U32 dictLimit = ms->window.dictLimit;
  1033. const BYTE* const prefixStart = base + dictLimit;
  1034. const BYTE* const dictEnd = dictBase + dictLimit;
  1035. const U32 curr = (U32)(ip-base);
  1036. const U32 maxDistance = 1U << cParams->windowLog;
  1037. const U32 lowestValid = ms->window.lowLimit;
  1038. const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
  1039. const U32 isDictionary = (ms->loadedDictEnd != 0);
  1040. const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
  1041. const U32 rowEntries = (1U << rowLog);
  1042. const U32 rowMask = rowEntries - 1;
  1043. const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
  1044. const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
  1045. const U64 hashSalt = ms->hashSalt;
  1046. U32 nbAttempts = 1U << cappedSearchLog;
  1047. size_t ml=4-1;
  1048. U32 hash;
  1049. /* DMS/DDS variables that may be referenced laster */
  1050. const ZSTD_MatchState_t* const dms = ms->dictMatchState;
  1051. /* Initialize the following variables to satisfy static analyzer */
  1052. size_t ddsIdx = 0;
  1053. U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
  1054. U32 dmsTag = 0;
  1055. U32* dmsRow = NULL;
  1056. BYTE* dmsTagRow = NULL;
  1057. if (dictMode == ZSTD_dedicatedDictSearch) {
  1058. const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
  1059. { /* Prefetch DDS hashtable entry */
  1060. ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
  1061. PREFETCH_L1(&dms->hashTable[ddsIdx]);
  1062. }
  1063. ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
  1064. }
  1065. if (dictMode == ZSTD_dictMatchState) {
  1066. /* Prefetch DMS rows */
  1067. U32* const dmsHashTable = dms->hashTable;
  1068. BYTE* const dmsTagTable = dms->tagTable;
  1069. U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
  1070. U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  1071. dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
  1072. dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
  1073. dmsRow = dmsHashTable + dmsRelRow;
  1074. ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
  1075. }
  1076. /* Update the hashTable and tagTable up to (but not including) ip */
  1077. if (!ms->lazySkipping) {
  1078. ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
  1079. hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
  1080. } else {
  1081. /* Stop inserting every position when in the lazy skipping mode.
  1082. * The hash cache is also not kept up to date in this mode.
  1083. */
  1084. hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
  1085. ms->nextToUpdate = curr;
  1086. }
  1087. ms->hashSaltEntropy += hash; /* collect salt entropy */
  1088. { /* Get the hash for ip, compute the appropriate row */
  1089. U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  1090. U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
  1091. U32* const row = hashTable + relRow;
  1092. BYTE* tagRow = (BYTE*)(tagTable + relRow);
  1093. U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
  1094. U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  1095. size_t numMatches = 0;
  1096. size_t currMatch = 0;
  1097. ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
  1098. /* Cycle through the matches and prefetch */
  1099. for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
  1100. U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  1101. U32 const matchIndex = row[matchPos];
  1102. if(matchPos == 0) continue;
  1103. assert(numMatches < rowEntries);
  1104. if (matchIndex < lowLimit)
  1105. break;
  1106. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  1107. PREFETCH_L1(base + matchIndex);
  1108. } else {
  1109. PREFETCH_L1(dictBase + matchIndex);
  1110. }
  1111. matchBuffer[numMatches++] = matchIndex;
  1112. --nbAttempts;
  1113. }
  1114. /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
  1115. in ZSTD_row_update_internal() at the next search. */
  1116. {
  1117. U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
  1118. tagRow[pos] = (BYTE)tag;
  1119. row[pos] = ms->nextToUpdate++;
  1120. }
  1121. /* Return the longest match */
  1122. for (; currMatch < numMatches; ++currMatch) {
  1123. U32 const matchIndex = matchBuffer[currMatch];
  1124. size_t currentMl=0;
  1125. assert(matchIndex < curr);
  1126. assert(matchIndex >= lowLimit);
  1127. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  1128. const BYTE* const match = base + matchIndex;
  1129. assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
  1130. /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
  1131. if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  1132. currentMl = ZSTD_count(ip, match, iLimit);
  1133. } else {
  1134. const BYTE* const match = dictBase + matchIndex;
  1135. assert(match+4 <= dictEnd);
  1136. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  1137. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
  1138. }
  1139. /* Save best solution */
  1140. if (currentMl > ml) {
  1141. ml = currentMl;
  1142. *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  1143. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  1144. }
  1145. }
  1146. }
  1147. assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  1148. if (dictMode == ZSTD_dedicatedDictSearch) {
  1149. ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
  1150. ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
  1151. } else if (dictMode == ZSTD_dictMatchState) {
  1152. /* TODO: Measure and potentially add prefetching to DMS */
  1153. const U32 dmsLowestIndex = dms->window.dictLimit;
  1154. const BYTE* const dmsBase = dms->window.base;
  1155. const BYTE* const dmsEnd = dms->window.nextSrc;
  1156. const U32 dmsSize = (U32)(dmsEnd - dmsBase);
  1157. const U32 dmsIndexDelta = dictLimit - dmsSize;
  1158. { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
  1159. U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  1160. size_t numMatches = 0;
  1161. size_t currMatch = 0;
  1162. ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
  1163. for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
  1164. U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  1165. U32 const matchIndex = dmsRow[matchPos];
  1166. if(matchPos == 0) continue;
  1167. if (matchIndex < dmsLowestIndex)
  1168. break;
  1169. PREFETCH_L1(dmsBase + matchIndex);
  1170. matchBuffer[numMatches++] = matchIndex;
  1171. --nbAttempts;
  1172. }
  1173. /* Return the longest match */
  1174. for (; currMatch < numMatches; ++currMatch) {
  1175. U32 const matchIndex = matchBuffer[currMatch];
  1176. size_t currentMl=0;
  1177. assert(matchIndex >= dmsLowestIndex);
  1178. assert(matchIndex < curr);
  1179. { const BYTE* const match = dmsBase + matchIndex;
  1180. assert(match+4 <= dmsEnd);
  1181. if (MEM_read32(match) == MEM_read32(ip))
  1182. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
  1183. }
  1184. if (currentMl > ml) {
  1185. ml = currentMl;
  1186. assert(curr > matchIndex + dmsIndexDelta);
  1187. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  1188. if (ip+currentMl == iLimit) break;
  1189. }
  1190. }
  1191. }
  1192. }
  1193. return ml;
  1194. }
  1195. /*
  1196. * Generate search functions templated on (dictMode, mls, rowLog).
  1197. * These functions are outlined for code size & compilation time.
  1198. * ZSTD_searchMax() dispatches to the correct implementation function.
  1199. *
  1200. * TODO: The start of the search function involves loading and calculating a
  1201. * bunch of constants from the ZSTD_MatchState_t. These computations could be
  1202. * done in an initialization function, and saved somewhere in the match state.
  1203. * Then we could pass a pointer to the saved state instead of the match state,
  1204. * and avoid duplicate computations.
  1205. *
  1206. * TODO: Move the match re-winding into searchMax. This improves compression
  1207. * ratio, and unlocks further simplifications with the next TODO.
  1208. *
  1209. * TODO: Try moving the repcode search into searchMax. After the re-winding
  1210. * and repcode search are in searchMax, there is no more logic in the match
  1211. * finder loop that requires knowledge about the dictMode. So we should be
  1212. * able to avoid force inlining it, and we can join the extDict loop with
  1213. * the single segment loop. It should go in searchMax instead of its own
  1214. * function to avoid having multiple virtual function calls per search.
  1215. */
  1216. #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
  1217. #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
  1218. #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
  1219. #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
  1220. #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
  1221. ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
  1222. ZSTD_MatchState_t* ms, \
  1223. const BYTE* ip, const BYTE* const iLimit, \
  1224. size_t* offBasePtr) \
  1225. { \
  1226. assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
  1227. return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
  1228. } \
  1229. #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
  1230. ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
  1231. ZSTD_MatchState_t* ms, \
  1232. const BYTE* ip, const BYTE* const iLimit, \
  1233. size_t* offsetPtr) \
  1234. { \
  1235. assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
  1236. return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
  1237. } \
  1238. #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
  1239. ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
  1240. ZSTD_MatchState_t* ms, \
  1241. const BYTE* ip, const BYTE* const iLimit, \
  1242. size_t* offsetPtr) \
  1243. { \
  1244. assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
  1245. assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
  1246. return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
  1247. } \
  1248. #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
  1249. X(dictMode, mls, 4) \
  1250. X(dictMode, mls, 5) \
  1251. X(dictMode, mls, 6)
  1252. #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
  1253. ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
  1254. ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
  1255. ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
  1256. #define ZSTD_FOR_EACH_MLS(X, dictMode) \
  1257. X(dictMode, 4) \
  1258. X(dictMode, 5) \
  1259. X(dictMode, 6)
  1260. #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
  1261. X(__VA_ARGS__, noDict) \
  1262. X(__VA_ARGS__, extDict) \
  1263. X(__VA_ARGS__, dictMatchState) \
  1264. X(__VA_ARGS__, dedicatedDictSearch)
  1265. /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
  1266. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
  1267. /* Generate binary Tree search fns for each combination of (dictMode, mls) */
  1268. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
  1269. /* Generate hash chain search fns for each combination of (dictMode, mls) */
  1270. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
  1271. typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
  1272. #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
  1273. case mls: \
  1274. return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
  1275. #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
  1276. case mls: \
  1277. return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
  1278. #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
  1279. case rowLog: \
  1280. return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
  1281. #define ZSTD_SWITCH_MLS(X, dictMode) \
  1282. switch (mls) { \
  1283. ZSTD_FOR_EACH_MLS(X, dictMode) \
  1284. }
  1285. #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
  1286. case mls: \
  1287. switch (rowLog) { \
  1288. ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
  1289. } \
  1290. ZSTD_UNREACHABLE; \
  1291. break;
  1292. #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
  1293. switch (searchMethod) { \
  1294. case search_hashChain: \
  1295. ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
  1296. break; \
  1297. case search_binaryTree: \
  1298. ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
  1299. break; \
  1300. case search_rowHash: \
  1301. ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
  1302. break; \
  1303. } \
  1304. ZSTD_UNREACHABLE;
  1305. /*
  1306. * Searches for the longest match at @p ip.
  1307. * Dispatches to the correct implementation function based on the
  1308. * (searchMethod, dictMode, mls, rowLog). We use switch statements
  1309. * here instead of using an indirect function call through a function
  1310. * pointer because after Spectre and Meltdown mitigations, indirect
  1311. * function calls can be very costly, especially in the kernel.
  1312. *
  1313. * NOTE: dictMode and searchMethod should be templated, so those switch
  1314. * statements should be optimized out. Only the mls & rowLog switches
  1315. * should be left.
  1316. *
  1317. * @param ms The match state.
  1318. * @param ip The position to search at.
  1319. * @param iend The end of the input data.
  1320. * @param[out] offsetPtr Stores the match offset into this pointer.
  1321. * @param mls The minimum search length, in the range [4, 6].
  1322. * @param rowLog The row log (if applicable), in the range [4, 6].
  1323. * @param searchMethod The search method to use (templated).
  1324. * @param dictMode The dictMode (templated).
  1325. *
  1326. * @returns The length of the longest match found, or < mls if no match is found.
  1327. * If a match is found its offset is stored in @p offsetPtr.
  1328. */
  1329. FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
  1330. ZSTD_MatchState_t* ms,
  1331. const BYTE* ip,
  1332. const BYTE* iend,
  1333. size_t* offsetPtr,
  1334. U32 const mls,
  1335. U32 const rowLog,
  1336. searchMethod_e const searchMethod,
  1337. ZSTD_dictMode_e const dictMode)
  1338. {
  1339. if (dictMode == ZSTD_noDict) {
  1340. ZSTD_SWITCH_SEARCH_METHOD(noDict)
  1341. } else if (dictMode == ZSTD_extDict) {
  1342. ZSTD_SWITCH_SEARCH_METHOD(extDict)
  1343. } else if (dictMode == ZSTD_dictMatchState) {
  1344. ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
  1345. } else if (dictMode == ZSTD_dedicatedDictSearch) {
  1346. ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
  1347. }
  1348. ZSTD_UNREACHABLE;
  1349. return 0;
  1350. }
  1351. /* *******************************
  1352. * Common parser - lazy strategy
  1353. *********************************/
  1354. FORCE_INLINE_TEMPLATE
  1355. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1356. size_t ZSTD_compressBlock_lazy_generic(
  1357. ZSTD_MatchState_t* ms, SeqStore_t* seqStore,
  1358. U32 rep[ZSTD_REP_NUM],
  1359. const void* src, size_t srcSize,
  1360. const searchMethod_e searchMethod, const U32 depth,
  1361. ZSTD_dictMode_e const dictMode)
  1362. {
  1363. const BYTE* const istart = (const BYTE*)src;
  1364. const BYTE* ip = istart;
  1365. const BYTE* anchor = istart;
  1366. const BYTE* const iend = istart + srcSize;
  1367. const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  1368. const BYTE* const base = ms->window.base;
  1369. const U32 prefixLowestIndex = ms->window.dictLimit;
  1370. const BYTE* const prefixLowest = base + prefixLowestIndex;
  1371. const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
  1372. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  1373. U32 offset_1 = rep[0], offset_2 = rep[1];
  1374. U32 offsetSaved1 = 0, offsetSaved2 = 0;
  1375. const int isDMS = dictMode == ZSTD_dictMatchState;
  1376. const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
  1377. const int isDxS = isDMS || isDDS;
  1378. const ZSTD_MatchState_t* const dms = ms->dictMatchState;
  1379. const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
  1380. const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
  1381. const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
  1382. const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
  1383. const U32 dictIndexDelta = isDxS ?
  1384. prefixLowestIndex - (U32)(dictEnd - dictBase) :
  1385. 0;
  1386. const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
  1387. DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
  1388. ip += (dictAndPrefixLength == 0);
  1389. if (dictMode == ZSTD_noDict) {
  1390. U32 const curr = (U32)(ip - base);
  1391. U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
  1392. U32 const maxRep = curr - windowLow;
  1393. if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
  1394. if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
  1395. }
  1396. if (isDxS) {
  1397. /* dictMatchState repCode checks don't currently handle repCode == 0
  1398. * disabling. */
  1399. assert(offset_1 <= dictAndPrefixLength);
  1400. assert(offset_2 <= dictAndPrefixLength);
  1401. }
  1402. /* Reset the lazy skipping state */
  1403. ms->lazySkipping = 0;
  1404. if (searchMethod == search_rowHash) {
  1405. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1406. }
  1407. /* Match Loop */
  1408. #if defined(__x86_64__)
  1409. /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
  1410. * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
  1411. */
  1412. __asm__(".p2align 5");
  1413. #endif
  1414. while (ip < ilimit) {
  1415. size_t matchLength=0;
  1416. size_t offBase = REPCODE1_TO_OFFBASE;
  1417. const BYTE* start=ip+1;
  1418. DEBUGLOG(7, "search baseline (depth 0)");
  1419. /* check repCode */
  1420. if (isDxS) {
  1421. const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
  1422. const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
  1423. && repIndex < prefixLowestIndex) ?
  1424. dictBase + (repIndex - dictIndexDelta) :
  1425. base + repIndex;
  1426. if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
  1427. && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
  1428. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1429. matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1430. if (depth==0) goto _storeSequence;
  1431. }
  1432. }
  1433. if ( dictMode == ZSTD_noDict
  1434. && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
  1435. matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
  1436. if (depth==0) goto _storeSequence;
  1437. }
  1438. /* first search (depth 0) */
  1439. { size_t offbaseFound = 999999999;
  1440. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
  1441. if (ml2 > matchLength)
  1442. matchLength = ml2, start = ip, offBase = offbaseFound;
  1443. }
  1444. if (matchLength < 4) {
  1445. size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
  1446. ip += step;
  1447. /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
  1448. * In this mode we stop inserting every position into our tables, and only insert
  1449. * positions that we search, which is one in step positions.
  1450. * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
  1451. * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
  1452. * triggered once we've gone 2KB without finding any matches.
  1453. */
  1454. ms->lazySkipping = step > kLazySkippingStep;
  1455. continue;
  1456. }
  1457. /* let's try to find a better solution */
  1458. if (depth>=1)
  1459. while (ip<ilimit) {
  1460. DEBUGLOG(7, "search depth 1");
  1461. ip ++;
  1462. if ( (dictMode == ZSTD_noDict)
  1463. && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  1464. size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  1465. int const gain2 = (int)(mlRep * 3);
  1466. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1467. if ((mlRep >= 4) && (gain2 > gain1))
  1468. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1469. }
  1470. if (isDxS) {
  1471. const U32 repIndex = (U32)(ip - base) - offset_1;
  1472. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1473. dictBase + (repIndex - dictIndexDelta) :
  1474. base + repIndex;
  1475. if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
  1476. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1477. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1478. size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1479. int const gain2 = (int)(mlRep * 3);
  1480. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1481. if ((mlRep >= 4) && (gain2 > gain1))
  1482. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1483. }
  1484. }
  1485. { size_t ofbCandidate=999999999;
  1486. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
  1487. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1488. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  1489. if ((ml2 >= 4) && (gain2 > gain1)) {
  1490. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1491. continue; /* search a better one */
  1492. } }
  1493. /* let's find an even better one */
  1494. if ((depth==2) && (ip<ilimit)) {
  1495. DEBUGLOG(7, "search depth 2");
  1496. ip ++;
  1497. if ( (dictMode == ZSTD_noDict)
  1498. && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  1499. size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  1500. int const gain2 = (int)(mlRep * 4);
  1501. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1502. if ((mlRep >= 4) && (gain2 > gain1))
  1503. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1504. }
  1505. if (isDxS) {
  1506. const U32 repIndex = (U32)(ip - base) - offset_1;
  1507. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1508. dictBase + (repIndex - dictIndexDelta) :
  1509. base + repIndex;
  1510. if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
  1511. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1512. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1513. size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1514. int const gain2 = (int)(mlRep * 4);
  1515. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1516. if ((mlRep >= 4) && (gain2 > gain1))
  1517. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1518. }
  1519. }
  1520. { size_t ofbCandidate=999999999;
  1521. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
  1522. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1523. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  1524. if ((ml2 >= 4) && (gain2 > gain1)) {
  1525. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1526. continue;
  1527. } } }
  1528. break; /* nothing found : store previous solution */
  1529. }
  1530. /* NOTE:
  1531. * Pay attention that `start[-value]` can lead to strange undefined behavior
  1532. * notably if `value` is unsigned, resulting in a large positive `-value`.
  1533. */
  1534. /* catch up */
  1535. if (OFFBASE_IS_OFFSET(offBase)) {
  1536. if (dictMode == ZSTD_noDict) {
  1537. while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
  1538. && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
  1539. { start--; matchLength++; }
  1540. }
  1541. if (isDxS) {
  1542. U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  1543. const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
  1544. const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
  1545. while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  1546. }
  1547. offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  1548. }
  1549. /* store sequence */
  1550. _storeSequence:
  1551. { size_t const litLength = (size_t)(start - anchor);
  1552. ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  1553. anchor = ip = start + matchLength;
  1554. }
  1555. if (ms->lazySkipping) {
  1556. /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
  1557. if (searchMethod == search_rowHash) {
  1558. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1559. }
  1560. ms->lazySkipping = 0;
  1561. }
  1562. /* check immediate repcode */
  1563. if (isDxS) {
  1564. while (ip <= ilimit) {
  1565. U32 const current2 = (U32)(ip-base);
  1566. U32 const repIndex = current2 - offset_2;
  1567. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1568. dictBase - dictIndexDelta + repIndex :
  1569. base + repIndex;
  1570. if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
  1571. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1572. const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
  1573. matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
  1574. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
  1575. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1576. ip += matchLength;
  1577. anchor = ip;
  1578. continue;
  1579. }
  1580. break;
  1581. }
  1582. }
  1583. if (dictMode == ZSTD_noDict) {
  1584. while ( ((ip <= ilimit) & (offset_2>0))
  1585. && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
  1586. /* store sequence */
  1587. matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
  1588. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
  1589. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1590. ip += matchLength;
  1591. anchor = ip;
  1592. continue; /* faster when present ... (?) */
  1593. } } }
  1594. /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
  1595. * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
  1596. offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
  1597. /* save reps for next block */
  1598. rep[0] = offset_1 ? offset_1 : offsetSaved1;
  1599. rep[1] = offset_2 ? offset_2 : offsetSaved2;
  1600. /* Return the last literals size */
  1601. return (size_t)(iend - anchor);
  1602. }
  1603. #endif /* build exclusions */
  1604. #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
  1605. size_t ZSTD_compressBlock_greedy(
  1606. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1607. void const* src, size_t srcSize)
  1608. {
  1609. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
  1610. }
  1611. size_t ZSTD_compressBlock_greedy_dictMatchState(
  1612. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1613. void const* src, size_t srcSize)
  1614. {
  1615. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
  1616. }
  1617. size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
  1618. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1619. void const* src, size_t srcSize)
  1620. {
  1621. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
  1622. }
  1623. size_t ZSTD_compressBlock_greedy_row(
  1624. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1625. void const* src, size_t srcSize)
  1626. {
  1627. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
  1628. }
  1629. size_t ZSTD_compressBlock_greedy_dictMatchState_row(
  1630. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1631. void const* src, size_t srcSize)
  1632. {
  1633. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
  1634. }
  1635. size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
  1636. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1637. void const* src, size_t srcSize)
  1638. {
  1639. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
  1640. }
  1641. #endif
  1642. #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
  1643. size_t ZSTD_compressBlock_lazy(
  1644. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1645. void const* src, size_t srcSize)
  1646. {
  1647. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
  1648. }
  1649. size_t ZSTD_compressBlock_lazy_dictMatchState(
  1650. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1651. void const* src, size_t srcSize)
  1652. {
  1653. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
  1654. }
  1655. size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
  1656. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1657. void const* src, size_t srcSize)
  1658. {
  1659. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
  1660. }
  1661. size_t ZSTD_compressBlock_lazy_row(
  1662. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1663. void const* src, size_t srcSize)
  1664. {
  1665. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
  1666. }
  1667. size_t ZSTD_compressBlock_lazy_dictMatchState_row(
  1668. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1669. void const* src, size_t srcSize)
  1670. {
  1671. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
  1672. }
  1673. size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
  1674. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1675. void const* src, size_t srcSize)
  1676. {
  1677. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
  1678. }
  1679. #endif
  1680. #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
  1681. size_t ZSTD_compressBlock_lazy2(
  1682. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1683. void const* src, size_t srcSize)
  1684. {
  1685. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
  1686. }
  1687. size_t ZSTD_compressBlock_lazy2_dictMatchState(
  1688. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1689. void const* src, size_t srcSize)
  1690. {
  1691. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
  1692. }
  1693. size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
  1694. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1695. void const* src, size_t srcSize)
  1696. {
  1697. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
  1698. }
  1699. size_t ZSTD_compressBlock_lazy2_row(
  1700. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1701. void const* src, size_t srcSize)
  1702. {
  1703. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
  1704. }
  1705. size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
  1706. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1707. void const* src, size_t srcSize)
  1708. {
  1709. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
  1710. }
  1711. size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
  1712. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1713. void const* src, size_t srcSize)
  1714. {
  1715. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
  1716. }
  1717. #endif
  1718. #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
  1719. size_t ZSTD_compressBlock_btlazy2(
  1720. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1721. void const* src, size_t srcSize)
  1722. {
  1723. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
  1724. }
  1725. size_t ZSTD_compressBlock_btlazy2_dictMatchState(
  1726. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1727. void const* src, size_t srcSize)
  1728. {
  1729. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
  1730. }
  1731. #endif
  1732. #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
  1733. || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
  1734. || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
  1735. || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  1736. FORCE_INLINE_TEMPLATE
  1737. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1738. size_t ZSTD_compressBlock_lazy_extDict_generic(
  1739. ZSTD_MatchState_t* ms, SeqStore_t* seqStore,
  1740. U32 rep[ZSTD_REP_NUM],
  1741. const void* src, size_t srcSize,
  1742. const searchMethod_e searchMethod, const U32 depth)
  1743. {
  1744. const BYTE* const istart = (const BYTE*)src;
  1745. const BYTE* ip = istart;
  1746. const BYTE* anchor = istart;
  1747. const BYTE* const iend = istart + srcSize;
  1748. const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  1749. const BYTE* const base = ms->window.base;
  1750. const U32 dictLimit = ms->window.dictLimit;
  1751. const BYTE* const prefixStart = base + dictLimit;
  1752. const BYTE* const dictBase = ms->window.dictBase;
  1753. const BYTE* const dictEnd = dictBase + dictLimit;
  1754. const BYTE* const dictStart = dictBase + ms->window.lowLimit;
  1755. const U32 windowLog = ms->cParams.windowLog;
  1756. const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
  1757. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  1758. U32 offset_1 = rep[0], offset_2 = rep[1];
  1759. DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
  1760. /* Reset the lazy skipping state */
  1761. ms->lazySkipping = 0;
  1762. /* init */
  1763. ip += (ip == prefixStart);
  1764. if (searchMethod == search_rowHash) {
  1765. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1766. }
  1767. /* Match Loop */
  1768. #if defined(__x86_64__)
  1769. /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
  1770. * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
  1771. */
  1772. __asm__(".p2align 5");
  1773. #endif
  1774. while (ip < ilimit) {
  1775. size_t matchLength=0;
  1776. size_t offBase = REPCODE1_TO_OFFBASE;
  1777. const BYTE* start=ip+1;
  1778. U32 curr = (U32)(ip-base);
  1779. /* check repCode */
  1780. { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
  1781. const U32 repIndex = (U32)(curr+1 - offset_1);
  1782. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1783. const BYTE* const repMatch = repBase + repIndex;
  1784. if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
  1785. & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
  1786. if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
  1787. /* repcode detected we should take it */
  1788. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1789. matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1790. if (depth==0) goto _storeSequence;
  1791. } }
  1792. /* first search (depth 0) */
  1793. { size_t ofbCandidate = 999999999;
  1794. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1795. if (ml2 > matchLength)
  1796. matchLength = ml2, start = ip, offBase = ofbCandidate;
  1797. }
  1798. if (matchLength < 4) {
  1799. size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
  1800. ip += step + 1; /* jump faster over incompressible sections */
  1801. /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
  1802. * In this mode we stop inserting every position into our tables, and only insert
  1803. * positions that we search, which is one in step positions.
  1804. * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
  1805. * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
  1806. * triggered once we've gone 2KB without finding any matches.
  1807. */
  1808. ms->lazySkipping = step > kLazySkippingStep;
  1809. continue;
  1810. }
  1811. /* let's try to find a better solution */
  1812. if (depth>=1)
  1813. while (ip<ilimit) {
  1814. ip ++;
  1815. curr++;
  1816. /* check repCode */
  1817. if (offBase) {
  1818. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  1819. const U32 repIndex = (U32)(curr - offset_1);
  1820. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1821. const BYTE* const repMatch = repBase + repIndex;
  1822. if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
  1823. & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1824. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1825. /* repcode detected */
  1826. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1827. size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1828. int const gain2 = (int)(repLength * 3);
  1829. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1830. if ((repLength >= 4) && (gain2 > gain1))
  1831. matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1832. } }
  1833. /* search match, depth 1 */
  1834. { size_t ofbCandidate = 999999999;
  1835. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1836. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1837. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  1838. if ((ml2 >= 4) && (gain2 > gain1)) {
  1839. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1840. continue; /* search a better one */
  1841. } }
  1842. /* let's find an even better one */
  1843. if ((depth==2) && (ip<ilimit)) {
  1844. ip ++;
  1845. curr++;
  1846. /* check repCode */
  1847. if (offBase) {
  1848. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  1849. const U32 repIndex = (U32)(curr - offset_1);
  1850. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1851. const BYTE* const repMatch = repBase + repIndex;
  1852. if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
  1853. & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1854. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1855. /* repcode detected */
  1856. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1857. size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1858. int const gain2 = (int)(repLength * 4);
  1859. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1860. if ((repLength >= 4) && (gain2 > gain1))
  1861. matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1862. } }
  1863. /* search match, depth 2 */
  1864. { size_t ofbCandidate = 999999999;
  1865. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1866. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1867. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  1868. if ((ml2 >= 4) && (gain2 > gain1)) {
  1869. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1870. continue;
  1871. } } }
  1872. break; /* nothing found : store previous solution */
  1873. }
  1874. /* catch up */
  1875. if (OFFBASE_IS_OFFSET(offBase)) {
  1876. U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  1877. const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
  1878. const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
  1879. while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  1880. offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  1881. }
  1882. /* store sequence */
  1883. _storeSequence:
  1884. { size_t const litLength = (size_t)(start - anchor);
  1885. ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  1886. anchor = ip = start + matchLength;
  1887. }
  1888. if (ms->lazySkipping) {
  1889. /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
  1890. if (searchMethod == search_rowHash) {
  1891. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1892. }
  1893. ms->lazySkipping = 0;
  1894. }
  1895. /* check immediate repcode */
  1896. while (ip <= ilimit) {
  1897. const U32 repCurrent = (U32)(ip-base);
  1898. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
  1899. const U32 repIndex = repCurrent - offset_2;
  1900. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1901. const BYTE* const repMatch = repBase + repIndex;
  1902. if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
  1903. & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1904. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1905. /* repcode detected we should take it */
  1906. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1907. matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1908. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
  1909. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1910. ip += matchLength;
  1911. anchor = ip;
  1912. continue; /* faster when present ... (?) */
  1913. }
  1914. break;
  1915. } }
  1916. /* Save reps for next block */
  1917. rep[0] = offset_1;
  1918. rep[1] = offset_2;
  1919. /* Return the last literals size */
  1920. return (size_t)(iend - anchor);
  1921. }
  1922. #endif /* build exclusions */
  1923. #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
  1924. size_t ZSTD_compressBlock_greedy_extDict(
  1925. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1926. void const* src, size_t srcSize)
  1927. {
  1928. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
  1929. }
  1930. size_t ZSTD_compressBlock_greedy_extDict_row(
  1931. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1932. void const* src, size_t srcSize)
  1933. {
  1934. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
  1935. }
  1936. #endif
  1937. #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
  1938. size_t ZSTD_compressBlock_lazy_extDict(
  1939. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1940. void const* src, size_t srcSize)
  1941. {
  1942. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
  1943. }
  1944. size_t ZSTD_compressBlock_lazy_extDict_row(
  1945. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1946. void const* src, size_t srcSize)
  1947. {
  1948. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
  1949. }
  1950. #endif
  1951. #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
  1952. size_t ZSTD_compressBlock_lazy2_extDict(
  1953. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1954. void const* src, size_t srcSize)
  1955. {
  1956. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
  1957. }
  1958. size_t ZSTD_compressBlock_lazy2_extDict_row(
  1959. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1960. void const* src, size_t srcSize)
  1961. {
  1962. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
  1963. }
  1964. #endif
  1965. #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
  1966. size_t ZSTD_compressBlock_btlazy2_extDict(
  1967. ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1968. void const* src, size_t srcSize)
  1969. {
  1970. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
  1971. }
  1972. #endif