Line data Source code
1 : /**
2 : * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3 : * All rights reserved.
4 : *
5 : * This source code is licensed under the BSD-style license found in the
6 : * LICENSE file in the root directory of this source tree. An additional grant
7 : * of patent rights can be found in the PATENTS file in the same directory.
8 : */
9 :
10 :
11 : /*-**************************************
12 : * Tuning parameters
13 : ****************************************/
14 : #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15 : #define ZDICT_MIN_SAMPLES_SIZE 512
16 :
17 :
18 : /*-**************************************
19 : * Compiler Options
20 : ****************************************/
21 : /* Unix Large Files support (>4GB) */
22 : #define _FILE_OFFSET_BITS 64
23 : #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
24 : # define _LARGEFILE_SOURCE
25 : #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
26 : # define _LARGEFILE64_SOURCE
27 : #endif
28 :
29 :
30 : /*-*************************************
31 : * Dependencies
32 : ***************************************/
33 : #include <stdlib.h> /* malloc, free */
34 : #include <string.h> /* memset */
35 : #include <stdio.h> /* fprintf, fopen, ftello64 */
36 : #include <time.h> /* clock */
37 :
38 : #include "mem.h" /* read */
39 : #include "error_private.h"
40 : #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41 : #define HUF_STATIC_LINKING_ONLY
42 : #include "huf.h"
43 : #include "zstd_internal.h" /* includes zstd.h */
44 : #include "xxhash.h"
45 : #include "divsufsort.h"
46 : #ifndef ZDICT_STATIC_LINKING_ONLY
47 : # define ZDICT_STATIC_LINKING_ONLY
48 : #endif
49 : #include "zdict.h"
50 :
51 :
52 : /*-*************************************
53 : * Constants
54 : ***************************************/
55 : #define KB *(1 <<10)
56 : #define MB *(1 <<20)
57 : #define GB *(1U<<30)
58 :
59 : #define DICTLISTSIZE_DEFAULT 10000
60 :
61 : #define NOISELENGTH 32
62 :
63 : #define MINRATIO 4
64 : static const int g_compressionLevel_default = 5;
65 : static const U32 g_selectivity_default = 9;
66 : static const size_t g_provision_entropySize = 200;
67 : static const size_t g_min_fast_dictContent = 192;
68 :
69 :
70 : /*-*************************************
71 : * Console display
72 : ***************************************/
73 : #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
74 : #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
75 :
76 0 : static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
77 :
78 0 : static void ZDICT_printHex(const void* ptr, size_t length)
79 : {
80 0 : const BYTE* const b = (const BYTE*)ptr;
81 : size_t u;
82 0 : for (u=0; u<length; u++) {
83 0 : BYTE c = b[u];
84 0 : if (c<32 || c>126) c = '.'; /* non-printable char */
85 0 : DISPLAY("%c", c);
86 : }
87 0 : }
88 :
89 :
90 : /*-********************************************************
91 : * Helper functions
92 : **********************************************************/
93 0 : unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
94 :
95 0 : const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
96 :
97 0 : unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
98 : {
99 0 : if (dictSize < 8) return 0;
100 0 : if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
101 0 : return MEM_readLE32((const char*)dictBuffer + 4);
102 : }
103 :
104 :
105 : /*-********************************************************
106 : * Dictionary training functions
107 : **********************************************************/
108 0 : static unsigned ZDICT_NbCommonBytes (register size_t val)
109 : {
110 0 : if (MEM_isLittleEndian()) {
111 0 : if (MEM_64bits()) {
112 : # if defined(_MSC_VER) && defined(_WIN64)
113 : unsigned long r = 0;
114 : _BitScanForward64( &r, (U64)val );
115 : return (unsigned)(r>>3);
116 : # elif defined(__GNUC__) && (__GNUC__ >= 3)
117 0 : return (__builtin_ctzll((U64)val) >> 3);
118 : # else
119 : static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
120 : return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
121 : # endif
122 : } else { /* 32 bits */
123 : # if defined(_MSC_VER)
124 : unsigned long r=0;
125 : _BitScanForward( &r, (U32)val );
126 : return (unsigned)(r>>3);
127 : # elif defined(__GNUC__) && (__GNUC__ >= 3)
128 0 : return (__builtin_ctz((U32)val) >> 3);
129 : # else
130 : static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
131 : return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
132 : # endif
133 : }
134 : } else { /* Big Endian CPU */
135 0 : if (MEM_64bits()) {
136 : # if defined(_MSC_VER) && defined(_WIN64)
137 : unsigned long r = 0;
138 : _BitScanReverse64( &r, val );
139 : return (unsigned)(r>>3);
140 : # elif defined(__GNUC__) && (__GNUC__ >= 3)
141 0 : return (__builtin_clzll(val) >> 3);
142 : # else
143 : unsigned r;
144 : const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
145 : if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
146 : if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
147 : r += (!val);
148 : return r;
149 : # endif
150 : } else { /* 32 bits */
151 : # if defined(_MSC_VER)
152 : unsigned long r = 0;
153 : _BitScanReverse( &r, (unsigned long)val );
154 : return (unsigned)(r>>3);
155 : # elif defined(__GNUC__) && (__GNUC__ >= 3)
156 0 : return (__builtin_clz((U32)val) >> 3);
157 : # else
158 : unsigned r;
159 : if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
160 : r += (!val);
161 : return r;
162 : # endif
163 : } }
164 : }
165 :
166 :
167 : /*! ZDICT_count() :
168 : Count the nb of common bytes between 2 pointers.
169 : Note : this function presumes end of buffer followed by noisy guard band.
170 : */
171 0 : static size_t ZDICT_count(const void* pIn, const void* pMatch)
172 : {
173 0 : const char* const pStart = (const char*)pIn;
174 0 : for (;;) {
175 0 : size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
176 0 : if (!diff) {
177 0 : pIn = (const char*)pIn+sizeof(size_t);
178 0 : pMatch = (const char*)pMatch+sizeof(size_t);
179 0 : continue;
180 : }
181 0 : pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
182 0 : return (size_t)((const char*)pIn - pStart);
183 : }
184 : }
185 :
186 :
187 : typedef struct {
188 : U32 pos;
189 : U32 length;
190 : U32 savings;
191 : } dictItem;
192 :
193 0 : static void ZDICT_initDictItem(dictItem* d)
194 : {
195 0 : d->pos = 1;
196 0 : d->length = 0;
197 0 : d->savings = (U32)(-1);
198 0 : }
199 :
200 :
201 : #define LLIMIT 64 /* heuristic determined experimentally */
202 : #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
203 0 : static dictItem ZDICT_analyzePos(
204 : BYTE* doneMarks,
205 : const int* suffix, U32 start,
206 : const void* buffer, U32 minRatio, U32 notificationLevel)
207 : {
208 0 : U32 lengthList[LLIMIT] = {0};
209 0 : U32 cumulLength[LLIMIT] = {0};
210 0 : U32 savings[LLIMIT] = {0};
211 0 : const BYTE* b = (const BYTE*)buffer;
212 : size_t length;
213 0 : size_t maxLength = LLIMIT;
214 0 : size_t pos = suffix[start];
215 0 : U32 end = start;
216 : dictItem solution;
217 :
218 : /* init */
219 0 : memset(&solution, 0, sizeof(solution));
220 0 : doneMarks[pos] = 1;
221 :
222 : /* trivial repetition cases */
223 0 : if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
224 0 : ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
225 0 : ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
226 : /* skip and mark segment */
227 0 : U16 u16 = MEM_read16(b+pos+4);
228 0 : U32 u, e = 6;
229 0 : while (MEM_read16(b+pos+e) == u16) e+=2 ;
230 0 : if (b[pos+e] == b[pos+e-1]) e++;
231 0 : for (u=1; u<e; u++)
232 0 : doneMarks[pos+u] = 1;
233 0 : return solution;
234 : }
235 :
236 : /* look forward */
237 : do {
238 0 : end++;
239 0 : length = ZDICT_count(b + pos, b + suffix[end]);
240 0 : } while (length >=MINMATCHLENGTH);
241 :
242 : /* look backward */
243 : do {
244 0 : length = ZDICT_count(b + pos, b + *(suffix+start-1));
245 0 : if (length >=MINMATCHLENGTH) start--;
246 0 : } while(length >= MINMATCHLENGTH);
247 :
248 : /* exit if not found a minimum nb of repetitions */
249 0 : if (end-start < minRatio) {
250 : U32 idx;
251 0 : for(idx=start; idx<end; idx++)
252 0 : doneMarks[suffix[idx]] = 1;
253 0 : return solution;
254 : }
255 :
256 : { int i;
257 : U32 searchLength;
258 0 : U32 refinedStart = start;
259 0 : U32 refinedEnd = end;
260 :
261 0 : DISPLAYLEVEL(4, "\n");
262 0 : DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
263 0 : DISPLAYLEVEL(4, "\n");
264 :
265 0 : for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
266 0 : BYTE currentChar = 0;
267 0 : U32 currentCount = 0;
268 0 : U32 currentID = refinedStart;
269 : U32 id;
270 0 : U32 selectedCount = 0;
271 0 : U32 selectedID = currentID;
272 0 : for (id =refinedStart; id < refinedEnd; id++) {
273 0 : if (b[ suffix[id] + searchLength] != currentChar) {
274 0 : if (currentCount > selectedCount) {
275 0 : selectedCount = currentCount;
276 0 : selectedID = currentID;
277 : }
278 0 : currentID = id;
279 0 : currentChar = b[ suffix[id] + searchLength];
280 0 : currentCount = 0;
281 : }
282 0 : currentCount ++;
283 : }
284 0 : if (currentCount > selectedCount) { /* for last */
285 0 : selectedCount = currentCount;
286 0 : selectedID = currentID;
287 : }
288 :
289 0 : if (selectedCount < minRatio)
290 0 : break;
291 0 : refinedStart = selectedID;
292 0 : refinedEnd = refinedStart + selectedCount;
293 : }
294 :
295 : /* evaluate gain based on new ref */
296 0 : start = refinedStart;
297 0 : pos = suffix[refinedStart];
298 0 : end = start;
299 0 : memset(lengthList, 0, sizeof(lengthList));
300 :
301 : /* look forward */
302 : do {
303 0 : end++;
304 0 : length = ZDICT_count(b + pos, b + suffix[end]);
305 0 : if (length >= LLIMIT) length = LLIMIT-1;
306 0 : lengthList[length]++;
307 0 : } while (length >=MINMATCHLENGTH);
308 :
309 : /* look backward */
310 0 : length = MINMATCHLENGTH;
311 0 : while ((length >= MINMATCHLENGTH) & (start > 0)) {
312 0 : length = ZDICT_count(b + pos, b + suffix[start - 1]);
313 0 : if (length >= LLIMIT) length = LLIMIT - 1;
314 0 : lengthList[length]++;
315 0 : if (length >= MINMATCHLENGTH) start--;
316 : }
317 :
318 : /* largest useful length */
319 0 : memset(cumulLength, 0, sizeof(cumulLength));
320 0 : cumulLength[maxLength-1] = lengthList[maxLength-1];
321 0 : for (i=(int)(maxLength-2); i>=0; i--)
322 0 : cumulLength[i] = cumulLength[i+1] + lengthList[i];
323 :
324 0 : for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
325 0 : maxLength = i;
326 :
327 : /* reduce maxLength in case of final into repetitive data */
328 0 : { U32 l = (U32)maxLength;
329 0 : BYTE const c = b[pos + maxLength-1];
330 0 : while (b[pos+l-2]==c) l--;
331 0 : maxLength = l;
332 : }
333 0 : if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
334 :
335 : /* calculate savings */
336 0 : savings[5] = 0;
337 0 : for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
338 0 : savings[i] = savings[i-1] + (lengthList[i] * (i-3));
339 :
340 0 : DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341 : (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
342 :
343 0 : solution.pos = (U32)pos;
344 0 : solution.length = (U32)maxLength;
345 0 : solution.savings = savings[maxLength];
346 :
347 : /* mark positions done */
348 : { U32 id;
349 0 : for (id=start; id<end; id++) {
350 : U32 p, pEnd;
351 0 : U32 const testedPos = suffix[id];
352 0 : if (testedPos == pos)
353 0 : length = solution.length;
354 : else {
355 0 : length = ZDICT_count(b+pos, b+testedPos);
356 0 : if (length > solution.length) length = solution.length;
357 : }
358 0 : pEnd = (U32)(testedPos + length);
359 0 : for (p=testedPos; p<pEnd; p++)
360 0 : doneMarks[p] = 1;
361 : } } }
362 :
363 0 : return solution;
364 : }
365 :
366 :
367 : /*! ZDICT_checkMerge
368 : check if dictItem can be merged, do it if possible
369 : @return : id of destination elt, 0 if not merged
370 : */
371 0 : static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
372 : {
373 0 : const U32 tableSize = table->pos;
374 0 : const U32 max = elt.pos + (elt.length-1);
375 :
376 : /* tail overlap */
377 0 : U32 u; for (u=1; u<tableSize; u++) {
378 0 : if (u==eltNbToSkip) continue;
379 0 : if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
380 : /* append */
381 0 : U32 addedLength = table[u].pos - elt.pos;
382 0 : table[u].length += addedLength;
383 0 : table[u].pos = elt.pos;
384 0 : table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
385 0 : table[u].savings += elt.length / 8; /* rough approx */
386 0 : elt = table[u];
387 0 : while ((u>1) && (table[u-1].savings < elt.savings))
388 0 : table[u] = table[u-1], u--;
389 0 : table[u] = elt;
390 0 : return u;
391 : } }
392 :
393 : /* front overlap */
394 0 : for (u=1; u<tableSize; u++) {
395 0 : if (u==eltNbToSkip) continue;
396 0 : if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
397 : /* append */
398 0 : int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
399 0 : table[u].savings += elt.length / 8; /* rough approx */
400 0 : if (addedLength > 0) { /* otherwise, already included */
401 0 : table[u].length += addedLength;
402 0 : table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
403 : }
404 0 : elt = table[u];
405 0 : while ((u>1) && (table[u-1].savings < elt.savings))
406 0 : table[u] = table[u-1], u--;
407 0 : table[u] = elt;
408 0 : return u;
409 : } }
410 :
411 0 : return 0;
412 : }
413 :
414 :
415 0 : static void ZDICT_removeDictItem(dictItem* table, U32 id)
416 : {
417 : /* convention : first element is nb of elts */
418 0 : U32 const max = table->pos;
419 : U32 u;
420 0 : if (!id) return; /* protection, should never happen */
421 0 : for (u=id; u<max-1; u++)
422 0 : table[u] = table[u+1];
423 0 : table->pos--;
424 : }
425 :
426 :
427 0 : static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
428 : {
429 : /* merge if possible */
430 0 : U32 mergeId = ZDICT_checkMerge(table, elt, 0);
431 0 : if (mergeId) {
432 0 : U32 newMerge = 1;
433 0 : while (newMerge) {
434 0 : newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
435 0 : if (newMerge) ZDICT_removeDictItem(table, mergeId);
436 0 : mergeId = newMerge;
437 : }
438 0 : return;
439 : }
440 :
441 : /* insert */
442 : { U32 current;
443 0 : U32 nextElt = table->pos;
444 0 : if (nextElt >= maxSize) nextElt = maxSize-1;
445 0 : current = nextElt-1;
446 0 : while (table[current].savings < elt.savings) {
447 0 : table[current+1] = table[current];
448 0 : current--;
449 : }
450 0 : table[current+1] = elt;
451 0 : table->pos = nextElt+1;
452 : }
453 : }
454 :
455 :
456 0 : static U32 ZDICT_dictSize(const dictItem* dictList)
457 : {
458 0 : U32 u, dictSize = 0;
459 0 : for (u=1; u<dictList[0].pos; u++)
460 0 : dictSize += dictList[u].length;
461 0 : return dictSize;
462 : }
463 :
464 :
465 0 : static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
466 : const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
467 : const size_t* fileSizes, unsigned nbFiles,
468 : U32 minRatio, U32 notificationLevel)
469 : {
470 0 : int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
471 0 : int* const suffix = suffix0+1;
472 0 : U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
473 0 : BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
474 0 : U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
475 0 : size_t result = 0;
476 0 : clock_t displayClock = 0;
477 0 : clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
478 :
479 : # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
480 : if (ZDICT_clockSpan(displayClock) > refreshRate) \
481 : { displayClock = clock(); DISPLAY(__VA_ARGS__); \
482 : if (notificationLevel>=4) fflush(stdout); } }
483 :
484 : /* init */
485 0 : DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
486 0 : if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
487 0 : result = ERROR(memory_allocation);
488 0 : goto _cleanup;
489 : }
490 0 : if (minRatio < MINRATIO) minRatio = MINRATIO;
491 0 : memset(doneMarks, 0, bufferSize+16);
492 :
493 : /* limit sample set size (divsufsort limitation)*/
494 0 : if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
495 0 : while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
496 :
497 : /* sort */
498 0 : DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
499 0 : { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
500 0 : if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
501 : }
502 0 : suffix[bufferSize] = (int)bufferSize; /* leads into noise */
503 0 : suffix0[0] = (int)bufferSize; /* leads into noise */
504 : /* build reverse suffix sort */
505 : { size_t pos;
506 0 : for (pos=0; pos < bufferSize; pos++)
507 0 : reverseSuffix[suffix[pos]] = (U32)pos;
508 : /* note filePos tracks borders between samples.
509 : It's not used at this stage, but planned to become useful in a later update */
510 0 : filePos[0] = 0;
511 0 : for (pos=1; pos<nbFiles; pos++)
512 0 : filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
513 : }
514 :
515 0 : DISPLAYLEVEL(2, "finding patterns ... \n");
516 0 : DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
517 :
518 0 : { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
519 : dictItem solution;
520 0 : if (doneMarks[cursor]) { cursor++; continue; }
521 0 : solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
522 0 : if (solution.length==0) { cursor++; continue; }
523 0 : ZDICT_insertDictItem(dictList, dictListSize, solution);
524 0 : cursor += solution.length;
525 0 : DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
526 : } }
527 :
528 : _cleanup:
529 0 : free(suffix0);
530 0 : free(reverseSuffix);
531 0 : free(doneMarks);
532 0 : free(filePos);
533 0 : return result;
534 : }
535 :
536 :
537 0 : static void ZDICT_fillNoise(void* buffer, size_t length)
538 : {
539 0 : unsigned const prime1 = 2654435761U;
540 0 : unsigned const prime2 = 2246822519U;
541 0 : unsigned acc = prime1;
542 0 : size_t p=0;;
543 0 : for (p=0; p<length; p++) {
544 0 : acc *= prime2;
545 0 : ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
546 : }
547 0 : }
548 :
549 :
550 : typedef struct
551 : {
552 : ZSTD_CCtx* ref;
553 : ZSTD_CCtx* zc;
554 : void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
555 : } EStats_ress_t;
556 :
557 : #define MAXREPOFFSET 1024
558 :
559 0 : static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
560 : U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
561 : const void* src, size_t srcSize, U32 notificationLevel)
562 : {
563 0 : size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
564 : size_t cSize;
565 :
566 0 : if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
567 0 : { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
568 0 : if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
569 : }
570 0 : cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
571 0 : if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
572 :
573 0 : if (cSize) { /* if == 0; block is not compressible */
574 0 : const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
575 :
576 : /* literals stats */
577 : { const BYTE* bytePtr;
578 0 : for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
579 0 : countLit[*bytePtr]++;
580 : }
581 :
582 : /* seqStats */
583 0 : { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
584 0 : ZSTD_seqToCodes(seqStorePtr);
585 :
586 0 : { const BYTE* codePtr = seqStorePtr->ofCode;
587 : U32 u;
588 0 : for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
589 : }
590 :
591 0 : { const BYTE* codePtr = seqStorePtr->mlCode;
592 : U32 u;
593 0 : for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
594 : }
595 :
596 0 : { const BYTE* codePtr = seqStorePtr->llCode;
597 : U32 u;
598 0 : for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
599 : }
600 :
601 0 : if (nbSeq >= 2) { /* rep offsets */
602 0 : const seqDef* const seq = seqStorePtr->sequencesStart;
603 0 : U32 offset1 = seq[0].offset - 3;
604 0 : U32 offset2 = seq[1].offset - 3;
605 0 : if (offset1 >= MAXREPOFFSET) offset1 = 0;
606 0 : if (offset2 >= MAXREPOFFSET) offset2 = 0;
607 0 : repOffsets[offset1] += 3;
608 0 : repOffsets[offset2] += 1;
609 : } } }
610 : }
611 :
612 : /*
613 : static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
614 : {
615 : unsigned u;
616 : size_t max=0;
617 : for (u=0; u<nbFiles; u++)
618 : if (max < fileSizes[u]) max = fileSizes[u];
619 : return max;
620 : }
621 : */
622 :
623 0 : static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
624 : {
625 0 : size_t total=0;
626 : unsigned u;
627 0 : for (u=0; u<nbFiles; u++) total += fileSizes[u];
628 0 : return total;
629 : }
630 :
631 : typedef struct { U32 offset; U32 count; } offsetCount_t;
632 :
633 0 : static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
634 : {
635 : U32 u;
636 0 : table[ZSTD_REP_NUM].offset = val;
637 0 : table[ZSTD_REP_NUM].count = count;
638 0 : for (u=ZSTD_REP_NUM; u>0; u--) {
639 : offsetCount_t tmp;
640 0 : if (table[u-1].count >= table[u].count) break;
641 0 : tmp = table[u-1];
642 0 : table[u-1] = table[u];
643 0 : table[u] = tmp;
644 : }
645 0 : }
646 :
647 :
648 : #define OFFCODE_MAX 30 /* only applicable to first block */
649 0 : static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
650 : unsigned compressionLevel,
651 : const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
652 : const void* dictBuffer, size_t dictBufferSize,
653 : unsigned notificationLevel)
654 : {
655 : U32 countLit[256];
656 0 : HUF_CREATE_STATIC_CTABLE(hufTable, 255);
657 : U32 offcodeCount[OFFCODE_MAX+1];
658 : short offcodeNCount[OFFCODE_MAX+1];
659 0 : U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
660 : U32 matchLengthCount[MaxML+1];
661 : short matchLengthNCount[MaxML+1];
662 : U32 litLengthCount[MaxLL+1];
663 : short litLengthNCount[MaxLL+1];
664 : U32 repOffset[MAXREPOFFSET];
665 : offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
666 : EStats_ress_t esr;
667 : ZSTD_parameters params;
668 0 : U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
669 0 : size_t pos = 0, errorCode;
670 0 : size_t eSize = 0;
671 0 : size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
672 0 : size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
673 0 : BYTE* dstPtr = (BYTE*)dstBuffer;
674 :
675 : /* init */
676 0 : esr.ref = ZSTD_createCCtx();
677 0 : esr.zc = ZSTD_createCCtx();
678 0 : esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
679 0 : if (!esr.ref || !esr.zc || !esr.workPlace) {
680 0 : eSize = ERROR(memory_allocation);
681 0 : DISPLAYLEVEL(1, "Not enough memory \n");
682 0 : goto _cleanup;
683 : }
684 0 : if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
685 0 : for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
686 0 : for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
687 0 : for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
688 0 : for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
689 0 : memset(repOffset, 0, sizeof(repOffset));
690 0 : repOffset[1] = repOffset[4] = repOffset[8] = 1;
691 0 : memset(bestRepOffset, 0, sizeof(bestRepOffset));
692 0 : if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
693 0 : params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
694 0 : { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
695 0 : if (ZSTD_isError(beginResult)) {
696 0 : eSize = ERROR(GENERIC);
697 0 : DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
698 0 : goto _cleanup;
699 : } }
700 :
701 : /* collect stats on all files */
702 0 : for (u=0; u<nbFiles; u++) {
703 0 : ZDICT_countEStats(esr, params,
704 : countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
705 0 : (const char*)srcBuffer + pos, fileSizes[u],
706 : notificationLevel);
707 0 : pos += fileSizes[u];
708 : }
709 :
710 : /* analyze */
711 0 : errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
712 0 : if (HUF_isError(errorCode)) {
713 0 : eSize = ERROR(GENERIC);
714 0 : DISPLAYLEVEL(1, "HUF_buildCTable error \n");
715 0 : goto _cleanup;
716 : }
717 0 : huffLog = (U32)errorCode;
718 :
719 : /* looking for most common first offsets */
720 : { U32 offset;
721 0 : for (offset=1; offset<MAXREPOFFSET; offset++)
722 0 : ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
723 : }
724 : /* note : the result of this phase should be used to better appreciate the impact on statistics */
725 :
726 0 : total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
727 0 : errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
728 0 : if (FSE_isError(errorCode)) {
729 0 : eSize = ERROR(GENERIC);
730 0 : DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
731 0 : goto _cleanup;
732 : }
733 0 : Offlog = (U32)errorCode;
734 :
735 0 : total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
736 0 : errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
737 0 : if (FSE_isError(errorCode)) {
738 0 : eSize = ERROR(GENERIC);
739 0 : DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
740 0 : goto _cleanup;
741 : }
742 0 : mlLog = (U32)errorCode;
743 :
744 0 : total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
745 0 : errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
746 0 : if (FSE_isError(errorCode)) {
747 0 : eSize = ERROR(GENERIC);
748 0 : DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
749 0 : goto _cleanup;
750 : }
751 0 : llLog = (U32)errorCode;
752 :
753 : /* write result to buffer */
754 0 : { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
755 0 : if (HUF_isError(hhSize)) {
756 0 : eSize = ERROR(GENERIC);
757 0 : DISPLAYLEVEL(1, "HUF_writeCTable error \n");
758 0 : goto _cleanup;
759 : }
760 0 : dstPtr += hhSize;
761 0 : maxDstSize -= hhSize;
762 0 : eSize += hhSize;
763 : }
764 :
765 0 : { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
766 0 : if (FSE_isError(ohSize)) {
767 0 : eSize = ERROR(GENERIC);
768 0 : DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
769 0 : goto _cleanup;
770 : }
771 0 : dstPtr += ohSize;
772 0 : maxDstSize -= ohSize;
773 0 : eSize += ohSize;
774 : }
775 :
776 0 : { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
777 0 : if (FSE_isError(mhSize)) {
778 0 : eSize = ERROR(GENERIC);
779 0 : DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
780 0 : goto _cleanup;
781 : }
782 0 : dstPtr += mhSize;
783 0 : maxDstSize -= mhSize;
784 0 : eSize += mhSize;
785 : }
786 :
787 0 : { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
788 0 : if (FSE_isError(lhSize)) {
789 0 : eSize = ERROR(GENERIC);
790 0 : DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
791 0 : goto _cleanup;
792 : }
793 0 : dstPtr += lhSize;
794 0 : maxDstSize -= lhSize;
795 0 : eSize += lhSize;
796 : }
797 :
798 0 : if (maxDstSize<12) {
799 0 : eSize = ERROR(GENERIC);
800 0 : DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
801 0 : goto _cleanup;
802 : }
803 : # if 0
804 : MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
805 : MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
806 : MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
807 : #else
808 : /* at this stage, we don't use the result of "most common first offset",
809 : as the impact of statistics is not properly evaluated */
810 0 : MEM_writeLE32(dstPtr+0, repStartValue[0]);
811 0 : MEM_writeLE32(dstPtr+4, repStartValue[1]);
812 0 : MEM_writeLE32(dstPtr+8, repStartValue[2]);
813 : #endif
814 0 : dstPtr += 12;
815 0 : eSize += 12;
816 :
817 : _cleanup:
818 0 : ZSTD_freeCCtx(esr.ref);
819 0 : ZSTD_freeCCtx(esr.zc);
820 0 : free(esr.workPlace);
821 :
822 0 : return eSize;
823 : }
824 :
825 :
826 0 : size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
827 : const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
828 : ZDICT_params_t params)
829 : {
830 : size_t hSize;
831 0 : int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
832 0 : U32 const notificationLevel = params.notificationLevel;
833 :
834 : /* dictionary header */
835 0 : MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
836 0 : { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
837 0 : U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
838 0 : U32 const dictID = params.dictID ? params.dictID : compliantID;
839 0 : MEM_writeLE32((char*)dictBuffer+4, dictID);
840 : }
841 0 : hSize = 8;
842 :
843 : /* entropy tables */
844 0 : DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
845 0 : DISPLAYLEVEL(2, "statistics ... \n");
846 0 : { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
847 : compressionLevel,
848 : samplesBuffer, samplesSizes, nbSamples,
849 0 : (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
850 : notificationLevel);
851 0 : if (ZDICT_isError(eSize)) return eSize;
852 0 : hSize += eSize;
853 : }
854 :
855 :
856 0 : if (hSize + dictContentSize < dictBufferCapacity)
857 0 : memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
858 0 : return MIN(dictBufferCapacity, hSize+dictContentSize);
859 : }
860 :
861 :
862 : /*! ZDICT_trainFromBuffer_unsafe() :
863 : * Warning : `samplesBuffer` must be followed by noisy guard band.
864 : * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
865 : */
866 0 : size_t ZDICT_trainFromBuffer_unsafe(
867 : void* dictBuffer, size_t maxDictSize,
868 : const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
869 : ZDICT_params_t params)
870 : {
871 0 : U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
872 0 : dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
873 0 : unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
874 0 : unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
875 0 : size_t const targetDictSize = maxDictSize;
876 0 : size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
877 0 : size_t dictSize = 0;
878 0 : U32 const notificationLevel = params.notificationLevel;
879 :
880 : /* checks */
881 0 : if (!dictList) return ERROR(memory_allocation);
882 0 : if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
883 0 : if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
884 :
885 : /* init */
886 0 : ZDICT_initDictItem(dictList);
887 :
888 : /* build dictionary */
889 0 : ZDICT_trainBuffer(dictList, dictListSize,
890 : samplesBuffer, samplesBuffSize,
891 : samplesSizes, nbSamples,
892 : minRep, notificationLevel);
893 :
894 : /* display best matches */
895 0 : if (params.notificationLevel>= 3) {
896 0 : U32 const nb = MIN(25, dictList[0].pos);
897 0 : U32 const dictContentSize = ZDICT_dictSize(dictList);
898 : U32 u;
899 0 : DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
900 0 : DISPLAYLEVEL(3, "list %u best segments \n", nb);
901 0 : for (u=1; u<=nb; u++) {
902 0 : U32 pos = dictList[u].pos;
903 0 : U32 length = dictList[u].length;
904 0 : U32 printedLength = MIN(40, length);
905 0 : DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
906 : u, length, pos, dictList[u].savings);
907 0 : ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
908 0 : DISPLAYLEVEL(3, "| \n");
909 : } }
910 :
911 :
912 : /* create dictionary */
913 0 : { U32 dictContentSize = ZDICT_dictSize(dictList);
914 0 : if (dictContentSize < targetDictSize/3) {
915 0 : DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
916 0 : if (minRep > MINRATIO) {
917 0 : DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
918 0 : DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
919 : }
920 0 : if (samplesBuffSize < 10 * targetDictSize)
921 0 : DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
922 : }
923 :
924 0 : if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
925 0 : U32 proposedSelectivity = selectivity-1;
926 0 : while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
927 0 : DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
928 0 : DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
929 0 : DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
930 : }
931 :
932 : /* limit dictionary size */
933 0 : { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
934 0 : U32 currentSize = 0;
935 0 : U32 n; for (n=1; n<max; n++) {
936 0 : currentSize += dictList[n].length;
937 0 : if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
938 : }
939 0 : dictList->pos = n;
940 0 : dictContentSize = currentSize;
941 : }
942 :
943 : /* build dict content */
944 : { U32 u;
945 0 : BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
946 0 : for (u=1; u<dictList->pos; u++) {
947 0 : U32 l = dictList[u].length;
948 0 : ptr -= l;
949 0 : if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */
950 0 : memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
951 : } }
952 :
953 0 : dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
954 : samplesBuffer, samplesSizes, nbSamples,
955 : params);
956 : }
957 :
958 : /* clean up */
959 0 : free(dictList);
960 0 : return dictSize;
961 : }
962 :
963 :
964 : /* issue : samplesBuffer need to be followed by a noisy guard band.
965 : * work around : duplicate the buffer, and add the noise */
966 0 : size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
967 : const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
968 : ZDICT_params_t params)
969 : {
970 : size_t result;
971 : void* newBuff;
972 0 : size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
973 0 : if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
974 :
975 0 : newBuff = malloc(sBuffSize + NOISELENGTH);
976 0 : if (!newBuff) return ERROR(memory_allocation);
977 :
978 0 : memcpy(newBuff, samplesBuffer, sBuffSize);
979 0 : ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
980 :
981 0 : result = ZDICT_trainFromBuffer_unsafe(
982 : dictBuffer, dictBufferCapacity,
983 : newBuff, samplesSizes, nbSamples,
984 : params);
985 0 : free(newBuff);
986 0 : return result;
987 : }
988 :
989 :
990 0 : size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
991 : const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
992 : {
993 : ZDICT_params_t params;
994 0 : memset(¶ms, 0, sizeof(params));
995 0 : return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
996 : samplesBuffer, samplesSizes, nbSamples,
997 : params);
998 : }
999 :
1000 0 : size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1001 : const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1002 : {
1003 : ZDICT_params_t params;
1004 0 : memset(¶ms, 0, sizeof(params));
1005 0 : return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1006 : samplesBuffer, samplesSizes, nbSamples,
1007 : params);
1008 : }
|