1 | /* |
---|
2 | dictBuilder - dictionary builder for zstd |
---|
3 | Copyright (C) Yann Collet 2016 |
---|
4 | |
---|
5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) |
---|
6 | |
---|
7 | Redistribution and use in source and binary forms, with or without |
---|
8 | modification, are permitted provided that the following conditions are |
---|
9 | met: |
---|
10 | |
---|
11 | * Redistributions of source code must retain the above copyright |
---|
12 | notice, this list of conditions and the following disclaimer. |
---|
13 | * Redistributions in binary form must reproduce the above |
---|
14 | copyright notice, this list of conditions and the following disclaimer |
---|
15 | in the documentation and/or other materials provided with the |
---|
16 | distribution. |
---|
17 | |
---|
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
---|
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
---|
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
---|
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
---|
22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
---|
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
---|
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
---|
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
---|
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
---|
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
29 | |
---|
30 | You can contact the author at : |
---|
31 | - Zstd homepage : https://www.zstd.net |
---|
32 | */ |
---|
33 | |
---|
34 | /*-************************************** |
---|
35 | * Tuning parameters |
---|
36 | ****************************************/ |
---|
37 | #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) |
---|
38 | |
---|
39 | |
---|
40 | /*-************************************** |
---|
41 | * Compiler Options |
---|
42 | ****************************************/ |
---|
43 | /* Unix Large Files support (>4GB) */ |
---|
44 | #define _FILE_OFFSET_BITS 64 |
---|
45 | #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ |
---|
46 | # define _LARGEFILE_SOURCE |
---|
47 | #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ |
---|
48 | # define _LARGEFILE64_SOURCE |
---|
49 | #endif |
---|
50 | |
---|
51 | |
---|
52 | /*-************************************* |
---|
53 | * Dependencies |
---|
54 | ***************************************/ |
---|
55 | #include <stdlib.h> /* malloc, free */ |
---|
56 | #include <string.h> /* memset */ |
---|
57 | #include <stdio.h> /* fprintf, fopen, ftello64 */ |
---|
58 | #include <time.h> /* clock */ |
---|
59 | |
---|
60 | #include "mem.h" /* read */ |
---|
61 | #include "error_private.h" |
---|
62 | #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ |
---|
63 | #define HUF_STATIC_LINKING_ONLY |
---|
64 | #include "huf.h" |
---|
65 | #include "zstd_internal.h" /* includes zstd.h */ |
---|
66 | #include "xxhash.h" |
---|
67 | #include "divsufsort.h" |
---|
68 | #ifndef ZDICT_STATIC_LINKING_ONLY |
---|
69 | # define ZDICT_STATIC_LINKING_ONLY |
---|
70 | #endif |
---|
71 | #include "zdict.h" |
---|
72 | |
---|
73 | |
---|
74 | /*-************************************* |
---|
75 | * Constants |
---|
76 | ***************************************/ |
---|
77 | #define KB *(1 <<10) |
---|
78 | #define MB *(1 <<20) |
---|
79 | #define GB *(1U<<30) |
---|
80 | |
---|
81 | #define DICTLISTSIZE 10000 |
---|
82 | |
---|
83 | #define NOISELENGTH 32 |
---|
84 | #define PRIME1 2654435761U |
---|
85 | #define PRIME2 2246822519U |
---|
86 | |
---|
87 | #define MINRATIO 4 |
---|
88 | static const U32 g_compressionLevel_default = 5; |
---|
89 | static const U32 g_selectivity_default = 9; |
---|
90 | static const size_t g_provision_entropySize = 200; |
---|
91 | static const size_t g_min_fast_dictContent = 192; |
---|
92 | |
---|
93 | |
---|
94 | /*-************************************* |
---|
95 | * Console display |
---|
96 | ***************************************/ |
---|
97 | #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } |
---|
98 | #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } |
---|
99 | static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */ |
---|
100 | |
---|
101 | #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \ |
---|
102 | if (ZDICT_clockSpan(g_time) > refreshRate) \ |
---|
103 | { g_time = clock(); DISPLAY(__VA_ARGS__); \ |
---|
104 | if (g_displayLevel>=4) fflush(stdout); } } |
---|
105 | static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10; |
---|
106 | static clock_t g_time = 0; |
---|
107 | |
---|
108 | static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } |
---|
109 | |
---|
110 | static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length) |
---|
111 | { |
---|
112 | const BYTE* const b = (const BYTE*)ptr; |
---|
113 | size_t u; |
---|
114 | for (u=0; u<length; u++) { |
---|
115 | BYTE c = b[u]; |
---|
116 | if (c<32 || c>126) c = '.'; /* non-printable char */ |
---|
117 | DISPLAYLEVEL(dlevel, "%c", c); |
---|
118 | } |
---|
119 | } |
---|
120 | |
---|
121 | |
---|
122 | /*-******************************************************** |
---|
123 | * Helper functions |
---|
124 | **********************************************************/ |
---|
125 | unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } |
---|
126 | |
---|
127 | const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } |
---|
128 | |
---|
129 | |
---|
130 | /*-******************************************************** |
---|
131 | * Dictionary training functions |
---|
132 | **********************************************************/ |
---|
133 | static unsigned ZDICT_NbCommonBytes (register size_t val) |
---|
134 | { |
---|
135 | if (MEM_isLittleEndian()) { |
---|
136 | if (MEM_64bits()) { |
---|
137 | # if defined(_MSC_VER) && defined(_WIN64) |
---|
138 | unsigned long r = 0; |
---|
139 | _BitScanForward64( &r, (U64)val ); |
---|
140 | return (unsigned)(r>>3); |
---|
141 | # elif defined(__GNUC__) && (__GNUC__ >= 3) |
---|
142 | return (__builtin_ctzll((U64)val) >> 3); |
---|
143 | # else |
---|
144 | static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; |
---|
145 | return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; |
---|
146 | # endif |
---|
147 | } else { /* 32 bits */ |
---|
148 | # if defined(_MSC_VER) |
---|
149 | unsigned long r=0; |
---|
150 | _BitScanForward( &r, (U32)val ); |
---|
151 | return (unsigned)(r>>3); |
---|
152 | # elif defined(__GNUC__) && (__GNUC__ >= 3) |
---|
153 | return (__builtin_ctz((U32)val) >> 3); |
---|
154 | # else |
---|
155 | static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; |
---|
156 | return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; |
---|
157 | # endif |
---|
158 | } |
---|
159 | } else { /* Big Endian CPU */ |
---|
160 | if (MEM_64bits()) { |
---|
161 | # if defined(_MSC_VER) && defined(_WIN64) |
---|
162 | unsigned long r = 0; |
---|
163 | _BitScanReverse64( &r, val ); |
---|
164 | return (unsigned)(r>>3); |
---|
165 | # elif defined(__GNUC__) && (__GNUC__ >= 3) |
---|
166 | return (__builtin_clzll(val) >> 3); |
---|
167 | # else |
---|
168 | unsigned r; |
---|
169 | const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ |
---|
170 | if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } |
---|
171 | if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } |
---|
172 | r += (!val); |
---|
173 | return r; |
---|
174 | # endif |
---|
175 | } else { /* 32 bits */ |
---|
176 | # if defined(_MSC_VER) |
---|
177 | unsigned long r = 0; |
---|
178 | _BitScanReverse( &r, (unsigned long)val ); |
---|
179 | return (unsigned)(r>>3); |
---|
180 | # elif defined(__GNUC__) && (__GNUC__ >= 3) |
---|
181 | return (__builtin_clz((U32)val) >> 3); |
---|
182 | # else |
---|
183 | unsigned r; |
---|
184 | if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } |
---|
185 | r += (!val); |
---|
186 | return r; |
---|
187 | # endif |
---|
188 | } } |
---|
189 | } |
---|
190 | |
---|
191 | |
---|
192 | /*! ZDICT_count() : |
---|
193 | Count the nb of common bytes between 2 pointers. |
---|
194 | Note : this function presumes end of buffer followed by noisy guard band. |
---|
195 | */ |
---|
196 | static size_t ZDICT_count(const void* pIn, const void* pMatch) |
---|
197 | { |
---|
198 | const char* const pStart = (const char*)pIn; |
---|
199 | for (;;) { |
---|
200 | size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); |
---|
201 | if (!diff) { |
---|
202 | pIn = (const char*)pIn+sizeof(size_t); |
---|
203 | pMatch = (const char*)pMatch+sizeof(size_t); |
---|
204 | continue; |
---|
205 | } |
---|
206 | pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); |
---|
207 | return (size_t)((const char*)pIn - pStart); |
---|
208 | } |
---|
209 | } |
---|
210 | |
---|
211 | |
---|
212 | typedef struct { |
---|
213 | U32 pos; |
---|
214 | U32 length; |
---|
215 | U32 savings; |
---|
216 | } dictItem; |
---|
217 | |
---|
218 | static void ZDICT_initDictItem(dictItem* d) |
---|
219 | { |
---|
220 | d->pos = 1; |
---|
221 | d->length = 0; |
---|
222 | d->savings = (U32)(-1); |
---|
223 | } |
---|
224 | |
---|
225 | |
---|
226 | #define LLIMIT 64 /* heuristic determined experimentally */ |
---|
227 | #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ |
---|
228 | static dictItem ZDICT_analyzePos( |
---|
229 | BYTE* doneMarks, |
---|
230 | const int* suffix, U32 start, |
---|
231 | const void* buffer, U32 minRatio) |
---|
232 | { |
---|
233 | U32 lengthList[LLIMIT] = {0}; |
---|
234 | U32 cumulLength[LLIMIT] = {0}; |
---|
235 | U32 savings[LLIMIT] = {0}; |
---|
236 | const BYTE* b = (const BYTE*)buffer; |
---|
237 | size_t length; |
---|
238 | size_t maxLength = LLIMIT; |
---|
239 | size_t pos = suffix[start]; |
---|
240 | U32 end = start; |
---|
241 | dictItem solution; |
---|
242 | |
---|
243 | /* init */ |
---|
244 | memset(&solution, 0, sizeof(solution)); |
---|
245 | doneMarks[pos] = 1; |
---|
246 | |
---|
247 | /* trivial repetition cases */ |
---|
248 | if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) |
---|
249 | ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) |
---|
250 | ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { |
---|
251 | /* skip and mark segment */ |
---|
252 | U16 u16 = MEM_read16(b+pos+4); |
---|
253 | U32 u, e = 6; |
---|
254 | while (MEM_read16(b+pos+e) == u16) e+=2 ; |
---|
255 | if (b[pos+e] == b[pos+e-1]) e++; |
---|
256 | for (u=1; u<e; u++) |
---|
257 | doneMarks[pos+u] = 1; |
---|
258 | return solution; |
---|
259 | } |
---|
260 | |
---|
261 | /* look forward */ |
---|
262 | do { |
---|
263 | end++; |
---|
264 | length = ZDICT_count(b + pos, b + suffix[end]); |
---|
265 | } while (length >=MINMATCHLENGTH); |
---|
266 | |
---|
267 | /* look backward */ |
---|
268 | do { |
---|
269 | length = ZDICT_count(b + pos, b + *(suffix+start-1)); |
---|
270 | if (length >=MINMATCHLENGTH) start--; |
---|
271 | } while(length >= MINMATCHLENGTH); |
---|
272 | |
---|
273 | /* exit if not found a minimum nb of repetitions */ |
---|
274 | if (end-start < minRatio) { |
---|
275 | U32 idx; |
---|
276 | for(idx=start; idx<end; idx++) |
---|
277 | doneMarks[suffix[idx]] = 1; |
---|
278 | return solution; |
---|
279 | } |
---|
280 | |
---|
281 | { int i; |
---|
282 | U32 searchLength; |
---|
283 | U32 refinedStart = start; |
---|
284 | U32 refinedEnd = end; |
---|
285 | |
---|
286 | DISPLAYLEVEL(4, "\n"); |
---|
287 | DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos); |
---|
288 | DISPLAYLEVEL(4, "\n"); |
---|
289 | |
---|
290 | for (searchLength = MINMATCHLENGTH ; ; searchLength++) { |
---|
291 | BYTE currentChar = 0; |
---|
292 | U32 currentCount = 0; |
---|
293 | U32 currentID = refinedStart; |
---|
294 | U32 id; |
---|
295 | U32 selectedCount = 0; |
---|
296 | U32 selectedID = currentID; |
---|
297 | for (id =refinedStart; id < refinedEnd; id++) { |
---|
298 | if (b[ suffix[id] + searchLength] != currentChar) { |
---|
299 | if (currentCount > selectedCount) { |
---|
300 | selectedCount = currentCount; |
---|
301 | selectedID = currentID; |
---|
302 | } |
---|
303 | currentID = id; |
---|
304 | currentChar = b[ suffix[id] + searchLength]; |
---|
305 | currentCount = 0; |
---|
306 | } |
---|
307 | currentCount ++; |
---|
308 | } |
---|
309 | if (currentCount > selectedCount) { /* for last */ |
---|
310 | selectedCount = currentCount; |
---|
311 | selectedID = currentID; |
---|
312 | } |
---|
313 | |
---|
314 | if (selectedCount < minRatio) |
---|
315 | break; |
---|
316 | refinedStart = selectedID; |
---|
317 | refinedEnd = refinedStart + selectedCount; |
---|
318 | } |
---|
319 | |
---|
320 | /* evaluate gain based on new ref */ |
---|
321 | start = refinedStart; |
---|
322 | pos = suffix[refinedStart]; |
---|
323 | end = start; |
---|
324 | memset(lengthList, 0, sizeof(lengthList)); |
---|
325 | |
---|
326 | /* look forward */ |
---|
327 | do { |
---|
328 | end++; |
---|
329 | length = ZDICT_count(b + pos, b + suffix[end]); |
---|
330 | if (length >= LLIMIT) length = LLIMIT-1; |
---|
331 | lengthList[length]++; |
---|
332 | } while (length >=MINMATCHLENGTH); |
---|
333 | |
---|
334 | /* look backward */ |
---|
335 | do { |
---|
336 | length = ZDICT_count(b + pos, b + suffix[start-1]); |
---|
337 | if (length >= LLIMIT) length = LLIMIT-1; |
---|
338 | lengthList[length]++; |
---|
339 | if (length >=MINMATCHLENGTH) start--; |
---|
340 | } while(length >= MINMATCHLENGTH); |
---|
341 | |
---|
342 | /* largest useful length */ |
---|
343 | memset(cumulLength, 0, sizeof(cumulLength)); |
---|
344 | cumulLength[maxLength-1] = lengthList[maxLength-1]; |
---|
345 | for (i=(int)(maxLength-2); i>=0; i--) |
---|
346 | cumulLength[i] = cumulLength[i+1] + lengthList[i]; |
---|
347 | |
---|
348 | for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; |
---|
349 | maxLength = i; |
---|
350 | |
---|
351 | /* reduce maxLength in case of final into repetitive data */ |
---|
352 | { U32 l = (U32)maxLength; |
---|
353 | BYTE const c = b[pos + maxLength-1]; |
---|
354 | while (b[pos+l-2]==c) l--; |
---|
355 | maxLength = l; |
---|
356 | } |
---|
357 | if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ |
---|
358 | |
---|
359 | /* calculate savings */ |
---|
360 | savings[5] = 0; |
---|
361 | for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) |
---|
362 | savings[i] = savings[i-1] + (lengthList[i] * (i-3)); |
---|
363 | |
---|
364 | DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", |
---|
365 | (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); |
---|
366 | |
---|
367 | solution.pos = (U32)pos; |
---|
368 | solution.length = (U32)maxLength; |
---|
369 | solution.savings = savings[maxLength]; |
---|
370 | |
---|
371 | /* mark positions done */ |
---|
372 | { U32 id; |
---|
373 | for (id=start; id<end; id++) { |
---|
374 | U32 p, pEnd; |
---|
375 | U32 const testedPos = suffix[id]; |
---|
376 | if (testedPos == pos) |
---|
377 | length = solution.length; |
---|
378 | else { |
---|
379 | length = ZDICT_count(b+pos, b+testedPos); |
---|
380 | if (length > solution.length) length = solution.length; |
---|
381 | } |
---|
382 | pEnd = (U32)(testedPos + length); |
---|
383 | for (p=testedPos; p<pEnd; p++) |
---|
384 | doneMarks[p] = 1; |
---|
385 | } } } |
---|
386 | |
---|
387 | return solution; |
---|
388 | } |
---|
389 | |
---|
390 | |
---|
391 | /*! ZDICT_checkMerge |
---|
392 | check if dictItem can be merged, do it if possible |
---|
393 | @return : id of destination elt, 0 if not merged |
---|
394 | */ |
---|
395 | static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip) |
---|
396 | { |
---|
397 | const U32 tableSize = table->pos; |
---|
398 | const U32 max = elt.pos + (elt.length-1); |
---|
399 | |
---|
400 | /* tail overlap */ |
---|
401 | U32 u; for (u=1; u<tableSize; u++) { |
---|
402 | if (u==eltNbToSkip) continue; |
---|
403 | if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */ |
---|
404 | /* append */ |
---|
405 | U32 addedLength = table[u].pos - elt.pos; |
---|
406 | table[u].length += addedLength; |
---|
407 | table[u].pos = elt.pos; |
---|
408 | table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ |
---|
409 | table[u].savings += elt.length / 8; /* rough approx */ |
---|
410 | elt = table[u]; |
---|
411 | while ((u>1) && (table[u-1].savings < elt.savings)) |
---|
412 | table[u] = table[u-1], u--; |
---|
413 | table[u] = elt; |
---|
414 | return u; |
---|
415 | } } |
---|
416 | |
---|
417 | /* front overlap */ |
---|
418 | for (u=1; u<tableSize; u++) { |
---|
419 | if (u==eltNbToSkip) continue; |
---|
420 | if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */ |
---|
421 | /* append */ |
---|
422 | int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length); |
---|
423 | table[u].savings += elt.length / 8; /* rough approx */ |
---|
424 | if (addedLength > 0) { /* otherwise, already included */ |
---|
425 | table[u].length += addedLength; |
---|
426 | table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ |
---|
427 | } |
---|
428 | elt = table[u]; |
---|
429 | while ((u>1) && (table[u-1].savings < elt.savings)) |
---|
430 | table[u] = table[u-1], u--; |
---|
431 | table[u] = elt; |
---|
432 | return u; |
---|
433 | } } |
---|
434 | |
---|
435 | return 0; |
---|
436 | } |
---|
437 | |
---|
438 | |
---|
439 | static void ZDICT_removeDictItem(dictItem* table, U32 id) |
---|
440 | { |
---|
441 | /* convention : first element is nb of elts */ |
---|
442 | U32 const max = table->pos; |
---|
443 | U32 u; |
---|
444 | if (!id) return; /* protection, should never happen */ |
---|
445 | for (u=id; u<max-1; u++) |
---|
446 | table[u] = table[u+1]; |
---|
447 | table->pos--; |
---|
448 | } |
---|
449 | |
---|
450 | |
---|
451 | static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt) |
---|
452 | { |
---|
453 | /* merge if possible */ |
---|
454 | U32 mergeId = ZDICT_checkMerge(table, elt, 0); |
---|
455 | if (mergeId) { |
---|
456 | U32 newMerge = 1; |
---|
457 | while (newMerge) { |
---|
458 | newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId); |
---|
459 | if (newMerge) ZDICT_removeDictItem(table, mergeId); |
---|
460 | mergeId = newMerge; |
---|
461 | } |
---|
462 | return; |
---|
463 | } |
---|
464 | |
---|
465 | /* insert */ |
---|
466 | { U32 current; |
---|
467 | U32 nextElt = table->pos; |
---|
468 | if (nextElt >= maxSize) nextElt = maxSize-1; |
---|
469 | current = nextElt-1; |
---|
470 | while (table[current].savings < elt.savings) { |
---|
471 | table[current+1] = table[current]; |
---|
472 | current--; |
---|
473 | } |
---|
474 | table[current+1] = elt; |
---|
475 | table->pos = nextElt+1; |
---|
476 | } |
---|
477 | } |
---|
478 | |
---|
479 | |
---|
480 | static U32 ZDICT_dictSize(const dictItem* dictList) |
---|
481 | { |
---|
482 | U32 u, dictSize = 0; |
---|
483 | for (u=1; u<dictList[0].pos; u++) |
---|
484 | dictSize += dictList[u].length; |
---|
485 | return dictSize; |
---|
486 | } |
---|
487 | |
---|
488 | |
---|
489 | static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize, |
---|
490 | const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ |
---|
491 | const size_t* fileSizes, unsigned nbFiles, |
---|
492 | U32 shiftRatio, unsigned maxDictSize) |
---|
493 | { |
---|
494 | int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0)); |
---|
495 | int* const suffix = suffix0+1; |
---|
496 | U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix)); |
---|
497 | BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */ |
---|
498 | U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); |
---|
499 | U32 minRatio = nbFiles >> shiftRatio; |
---|
500 | size_t result = 0; |
---|
501 | |
---|
502 | /* init */ |
---|
503 | DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ |
---|
504 | if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { |
---|
505 | result = ERROR(memory_allocation); |
---|
506 | goto _cleanup; |
---|
507 | } |
---|
508 | if (minRatio < MINRATIO) minRatio = MINRATIO; |
---|
509 | memset(doneMarks, 0, bufferSize+16); |
---|
510 | |
---|
511 | /* limit sample set size (divsufsort limitation)*/ |
---|
512 | if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20)); |
---|
513 | while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; |
---|
514 | |
---|
515 | /* sort */ |
---|
516 | DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); |
---|
517 | { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); |
---|
518 | if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } |
---|
519 | } |
---|
520 | suffix[bufferSize] = (int)bufferSize; /* leads into noise */ |
---|
521 | suffix0[0] = (int)bufferSize; /* leads into noise */ |
---|
522 | /* build reverse suffix sort */ |
---|
523 | { size_t pos; |
---|
524 | for (pos=0; pos < bufferSize; pos++) |
---|
525 | reverseSuffix[suffix[pos]] = (U32)pos; |
---|
526 | /* build file pos */ |
---|
527 | filePos[0] = 0; |
---|
528 | for (pos=1; pos<nbFiles; pos++) |
---|
529 | filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]); |
---|
530 | } |
---|
531 | |
---|
532 | DISPLAYLEVEL(2, "finding patterns ... \n"); |
---|
533 | DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio); |
---|
534 | |
---|
535 | { U32 cursor; for (cursor=0; cursor < bufferSize; ) { |
---|
536 | dictItem solution; |
---|
537 | if (doneMarks[cursor]) { cursor++; continue; } |
---|
538 | solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio); |
---|
539 | if (solution.length==0) { cursor++; continue; } |
---|
540 | ZDICT_insertDictItem(dictList, dictListSize, solution); |
---|
541 | cursor += solution.length; |
---|
542 | DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); |
---|
543 | } } |
---|
544 | |
---|
545 | /* limit dictionary size */ |
---|
546 | { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ |
---|
547 | U32 currentSize = 0; |
---|
548 | U32 n; for (n=1; n<max; n++) { |
---|
549 | currentSize += dictList[n].length; |
---|
550 | if (currentSize > maxDictSize) break; |
---|
551 | } |
---|
552 | dictList->pos = n; |
---|
553 | } |
---|
554 | |
---|
555 | _cleanup: |
---|
556 | free(suffix0); |
---|
557 | free(reverseSuffix); |
---|
558 | free(doneMarks); |
---|
559 | free(filePos); |
---|
560 | return result; |
---|
561 | } |
---|
562 | |
---|
563 | |
---|
564 | static void ZDICT_fillNoise(void* buffer, size_t length) |
---|
565 | { |
---|
566 | unsigned acc = PRIME1; |
---|
567 | size_t p=0;; |
---|
568 | for (p=0; p<length; p++) { |
---|
569 | acc *= PRIME2; |
---|
570 | ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); |
---|
571 | } |
---|
572 | } |
---|
573 | |
---|
574 | |
---|
575 | typedef struct |
---|
576 | { |
---|
577 | ZSTD_CCtx* ref; |
---|
578 | ZSTD_CCtx* zc; |
---|
579 | void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ |
---|
580 | } EStats_ress_t; |
---|
581 | |
---|
582 | #define MAXREPOFFSET 1024 |
---|
583 | |
---|
584 | static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, |
---|
585 | U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, |
---|
586 | const void* src, size_t srcSize) |
---|
587 | { |
---|
588 | size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); |
---|
589 | size_t cSize; |
---|
590 | |
---|
591 | if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ |
---|
592 | { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref); |
---|
593 | if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } |
---|
594 | } |
---|
595 | cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); |
---|
596 | if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; } |
---|
597 | |
---|
598 | if (cSize) { /* if == 0; block is not compressible */ |
---|
599 | const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); |
---|
600 | |
---|
601 | /* literals stats */ |
---|
602 | { const BYTE* bytePtr; |
---|
603 | for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) |
---|
604 | countLit[*bytePtr]++; |
---|
605 | } |
---|
606 | |
---|
607 | /* seqStats */ |
---|
608 | { size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart); |
---|
609 | ZSTD_seqToCodes(seqStorePtr, nbSeq); |
---|
610 | |
---|
611 | { const BYTE* codePtr = seqStorePtr->offCodeStart; |
---|
612 | size_t u; |
---|
613 | for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++; |
---|
614 | } |
---|
615 | |
---|
616 | { const BYTE* codePtr = seqStorePtr->mlCodeStart; |
---|
617 | size_t u; |
---|
618 | for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++; |
---|
619 | } |
---|
620 | |
---|
621 | { const BYTE* codePtr = seqStorePtr->llCodeStart; |
---|
622 | size_t u; |
---|
623 | for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++; |
---|
624 | } } |
---|
625 | |
---|
626 | /* rep offsets */ |
---|
627 | { const U32* const offsetPtr = seqStorePtr->offsetStart; |
---|
628 | U32 offset1 = offsetPtr[0] - 3; |
---|
629 | U32 offset2 = offsetPtr[1] - 3; |
---|
630 | if (offset1 >= MAXREPOFFSET) offset1 = 0; |
---|
631 | if (offset2 >= MAXREPOFFSET) offset2 = 0; |
---|
632 | repOffsets[offset1] += 3; |
---|
633 | repOffsets[offset2] += 1; |
---|
634 | } |
---|
635 | } |
---|
636 | } |
---|
637 | |
---|
638 | /* |
---|
639 | static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles) |
---|
640 | { |
---|
641 | unsigned u; |
---|
642 | size_t max=0; |
---|
643 | for (u=0; u<nbFiles; u++) |
---|
644 | if (max < fileSizes[u]) max = fileSizes[u]; |
---|
645 | return max; |
---|
646 | } |
---|
647 | */ |
---|
648 | |
---|
649 | static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) |
---|
650 | { |
---|
651 | size_t total=0; |
---|
652 | unsigned u; |
---|
653 | for (u=0; u<nbFiles; u++) total += fileSizes[u]; |
---|
654 | return total; |
---|
655 | } |
---|
656 | |
---|
657 | typedef struct { U32 offset; U32 count; } offsetCount_t; |
---|
658 | |
---|
659 | static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count) |
---|
660 | { |
---|
661 | U32 u; |
---|
662 | table[ZSTD_REP_NUM].offset = val; |
---|
663 | table[ZSTD_REP_NUM].count = count; |
---|
664 | for (u=ZSTD_REP_NUM; u>0; u--) { |
---|
665 | offsetCount_t tmp; |
---|
666 | if (table[u-1].count >= table[u].count) break; |
---|
667 | tmp = table[u-1]; |
---|
668 | table[u-1] = table[u]; |
---|
669 | table[u] = tmp; |
---|
670 | } |
---|
671 | } |
---|
672 | |
---|
673 | |
---|
674 | #define OFFCODE_MAX 18 /* only applicable to first block */ |
---|
675 | static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, |
---|
676 | unsigned compressionLevel, |
---|
677 | const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, |
---|
678 | const void* dictBuffer, size_t dictBufferSize) |
---|
679 | { |
---|
680 | U32 countLit[256]; |
---|
681 | HUF_CREATE_STATIC_CTABLE(hufTable, 255); |
---|
682 | U32 offcodeCount[OFFCODE_MAX+1]; |
---|
683 | short offcodeNCount[OFFCODE_MAX+1]; |
---|
684 | U32 matchLengthCount[MaxML+1]; |
---|
685 | short matchLengthNCount[MaxML+1]; |
---|
686 | U32 litLengthCount[MaxLL+1]; |
---|
687 | short litLengthNCount[MaxLL+1]; |
---|
688 | U32 repOffset[MAXREPOFFSET] = { 0 }; |
---|
689 | offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; |
---|
690 | EStats_ress_t esr; |
---|
691 | ZSTD_parameters params; |
---|
692 | U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; |
---|
693 | size_t pos = 0, errorCode; |
---|
694 | size_t eSize = 0; |
---|
695 | size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); |
---|
696 | size_t const averageSampleSize = totalSrcSize / nbFiles; |
---|
697 | BYTE* dstPtr = (BYTE*)dstBuffer; |
---|
698 | |
---|
699 | /* init */ |
---|
700 | for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ |
---|
701 | for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1; |
---|
702 | for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; |
---|
703 | for (u=0; u<=MaxLL; u++) litLengthCount[u]=1; |
---|
704 | repOffset[1] = repOffset[4] = repOffset[8] = 1; |
---|
705 | memset(bestRepOffset, 0, sizeof(bestRepOffset)); |
---|
706 | esr.ref = ZSTD_createCCtx(); |
---|
707 | esr.zc = ZSTD_createCCtx(); |
---|
708 | esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); |
---|
709 | if (!esr.ref || !esr.zc || !esr.workPlace) { |
---|
710 | eSize = ERROR(memory_allocation); |
---|
711 | DISPLAYLEVEL(1, "Not enough memory"); |
---|
712 | goto _cleanup; |
---|
713 | } |
---|
714 | if (compressionLevel==0) compressionLevel=g_compressionLevel_default; |
---|
715 | params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); |
---|
716 | { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); |
---|
717 | if (ZSTD_isError(beginResult)) { |
---|
718 | eSize = ERROR(GENERIC); |
---|
719 | DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed "); |
---|
720 | goto _cleanup; |
---|
721 | } } |
---|
722 | |
---|
723 | /* collect stats on all files */ |
---|
724 | for (u=0; u<nbFiles; u++) { |
---|
725 | ZDICT_countEStats(esr, params, |
---|
726 | countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, |
---|
727 | (const char*)srcBuffer + pos, fileSizes[u]); |
---|
728 | pos += fileSizes[u]; |
---|
729 | } |
---|
730 | |
---|
731 | /* analyze */ |
---|
732 | errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); |
---|
733 | if (HUF_isError(errorCode)) { |
---|
734 | eSize = ERROR(GENERIC); |
---|
735 | DISPLAYLEVEL(1, "HUF_buildCTable error"); |
---|
736 | goto _cleanup; |
---|
737 | } |
---|
738 | huffLog = (U32)errorCode; |
---|
739 | |
---|
740 | /* looking for most common first offsets */ |
---|
741 | { U32 offset; |
---|
742 | for (offset=1; offset<MAXREPOFFSET; offset++) |
---|
743 | ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); |
---|
744 | } |
---|
745 | /* note : the result of this phase should be used to better appreciate the impact on statistics */ |
---|
746 | |
---|
747 | total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u]; |
---|
748 | errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX); |
---|
749 | if (FSE_isError(errorCode)) { |
---|
750 | eSize = ERROR(GENERIC); |
---|
751 | DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount"); |
---|
752 | goto _cleanup; |
---|
753 | } |
---|
754 | Offlog = (U32)errorCode; |
---|
755 | |
---|
756 | total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; |
---|
757 | errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); |
---|
758 | if (FSE_isError(errorCode)) { |
---|
759 | eSize = ERROR(GENERIC); |
---|
760 | DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount"); |
---|
761 | goto _cleanup; |
---|
762 | } |
---|
763 | mlLog = (U32)errorCode; |
---|
764 | |
---|
765 | total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; |
---|
766 | errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL); |
---|
767 | if (FSE_isError(errorCode)) { |
---|
768 | eSize = ERROR(GENERIC); |
---|
769 | DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount"); |
---|
770 | goto _cleanup; |
---|
771 | } |
---|
772 | llLog = (U32)errorCode; |
---|
773 | |
---|
774 | |
---|
775 | /* write result to buffer */ |
---|
776 | { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); |
---|
777 | if (HUF_isError(hhSize)) { |
---|
778 | eSize = ERROR(GENERIC); |
---|
779 | DISPLAYLEVEL(1, "HUF_writeCTable error"); |
---|
780 | goto _cleanup; |
---|
781 | } |
---|
782 | dstPtr += hhSize; |
---|
783 | maxDstSize -= hhSize; |
---|
784 | eSize += hhSize; |
---|
785 | } |
---|
786 | |
---|
787 | { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); |
---|
788 | if (FSE_isError(ohSize)) { |
---|
789 | eSize = ERROR(GENERIC); |
---|
790 | DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount"); |
---|
791 | goto _cleanup; |
---|
792 | } |
---|
793 | dstPtr += ohSize; |
---|
794 | maxDstSize -= ohSize; |
---|
795 | eSize += ohSize; |
---|
796 | } |
---|
797 | |
---|
798 | { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); |
---|
799 | if (FSE_isError(mhSize)) { |
---|
800 | eSize = ERROR(GENERIC); |
---|
801 | DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount"); |
---|
802 | goto _cleanup; |
---|
803 | } |
---|
804 | dstPtr += mhSize; |
---|
805 | maxDstSize -= mhSize; |
---|
806 | eSize += mhSize; |
---|
807 | } |
---|
808 | |
---|
809 | { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); |
---|
810 | if (FSE_isError(lhSize)) { |
---|
811 | eSize = ERROR(GENERIC); |
---|
812 | DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount"); |
---|
813 | goto _cleanup; |
---|
814 | } |
---|
815 | dstPtr += lhSize; |
---|
816 | maxDstSize -= lhSize; |
---|
817 | eSize += lhSize; |
---|
818 | } |
---|
819 | |
---|
820 | if (maxDstSize<12) { |
---|
821 | eSize = ERROR(GENERIC); |
---|
822 | DISPLAYLEVEL(1, "not enough space to write RepOffsets"); |
---|
823 | goto _cleanup; |
---|
824 | } |
---|
825 | # if 0 |
---|
826 | MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset); |
---|
827 | MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset); |
---|
828 | MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); |
---|
829 | #else |
---|
830 | /* at this stage, we don't use the result of "most common first offset", |
---|
831 | as the impact of statistics is not properly evaluated */ |
---|
832 | MEM_writeLE32(dstPtr+0, repStartValue[0]); |
---|
833 | MEM_writeLE32(dstPtr+4, repStartValue[1]); |
---|
834 | MEM_writeLE32(dstPtr+8, repStartValue[2]); |
---|
835 | #endif |
---|
836 | dstPtr += 12; |
---|
837 | eSize += 12; |
---|
838 | |
---|
839 | _cleanup: |
---|
840 | ZSTD_freeCCtx(esr.ref); |
---|
841 | ZSTD_freeCCtx(esr.zc); |
---|
842 | free(esr.workPlace); |
---|
843 | |
---|
844 | return eSize; |
---|
845 | } |
---|
846 | |
---|
847 | |
---|
848 | #define DIB_FASTSEGMENTSIZE 64 |
---|
849 | /*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) : |
---|
850 | Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`, |
---|
851 | up to `dictSize`. |
---|
852 | Filling starts from the end of `dictBuffer`, down to maximum possible. |
---|
853 | if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used. |
---|
854 | @return : amount of data written into `dictBuffer`, |
---|
855 | or an error code |
---|
856 | */ |
---|
857 | static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize, |
---|
858 | const void* samplesBuffer, size_t samplesSize) |
---|
859 | { |
---|
860 | char* dstPtr = (char*)dictBuffer + dictSize; |
---|
861 | const char* srcPtr = (const char*)samplesBuffer; |
---|
862 | size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE; |
---|
863 | size_t segNb, interSize; |
---|
864 | |
---|
865 | if (nbSegments <= 2) return ERROR(srcSize_wrong); |
---|
866 | if (samplesSize < dictSize) return ERROR(srcSize_wrong); |
---|
867 | |
---|
868 | /* first and last segments are part of dictionary, in case they contain interesting header/footer */ |
---|
869 | dstPtr -= DIB_FASTSEGMENTSIZE; |
---|
870 | memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE); |
---|
871 | dstPtr -= DIB_FASTSEGMENTSIZE; |
---|
872 | memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE); |
---|
873 | |
---|
874 | /* regularly copy a segment */ |
---|
875 | interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1); |
---|
876 | srcPtr += DIB_FASTSEGMENTSIZE; |
---|
877 | for (segNb=2; segNb < nbSegments; segNb++) { |
---|
878 | srcPtr += interSize; |
---|
879 | dstPtr -= DIB_FASTSEGMENTSIZE; |
---|
880 | memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE); |
---|
881 | srcPtr += DIB_FASTSEGMENTSIZE; |
---|
882 | } |
---|
883 | |
---|
884 | return nbSegments * DIB_FASTSEGMENTSIZE; |
---|
885 | } |
---|
886 | |
---|
887 | size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
---|
888 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
---|
889 | ZDICT_params_t params) |
---|
890 | { |
---|
891 | size_t hSize; |
---|
892 | unsigned const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel; |
---|
893 | |
---|
894 | /* dictionary header */ |
---|
895 | MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); |
---|
896 | { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); |
---|
897 | U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; |
---|
898 | U32 const dictID = params.dictID ? params.dictID : compliantID; |
---|
899 | MEM_writeLE32((char*)dictBuffer+4, dictID); |
---|
900 | } |
---|
901 | hSize = 8; |
---|
902 | |
---|
903 | /* entropy tables */ |
---|
904 | DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ |
---|
905 | DISPLAYLEVEL(2, "statistics ... \n"); |
---|
906 | hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, |
---|
907 | compressionLevel, |
---|
908 | samplesBuffer, samplesSizes, nbSamples, |
---|
909 | (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); |
---|
910 | |
---|
911 | if (hSize + dictContentSize < dictBufferCapacity) |
---|
912 | memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); |
---|
913 | return MIN(dictBufferCapacity, hSize+dictContentSize); |
---|
914 | } |
---|
915 | |
---|
916 | |
---|
917 | #define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3) |
---|
918 | /*! ZDICT_trainFromBuffer_unsafe() : |
---|
919 | * `samplesBuffer` must be followed by noisy guard band. |
---|
920 | * @return : size of dictionary. |
---|
921 | */ |
---|
922 | size_t ZDICT_trainFromBuffer_unsafe( |
---|
923 | void* dictBuffer, size_t maxDictSize, |
---|
924 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
---|
925 | ZDICT_params_t params) |
---|
926 | { |
---|
927 | U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); |
---|
928 | dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); |
---|
929 | unsigned selectivity = params.selectivityLevel; |
---|
930 | size_t const targetDictSize = maxDictSize; |
---|
931 | size_t sBuffSize; |
---|
932 | size_t dictSize = 0; |
---|
933 | |
---|
934 | /* checks */ |
---|
935 | if (!dictList) return ERROR(memory_allocation); |
---|
936 | if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } |
---|
937 | |
---|
938 | /* init */ |
---|
939 | { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; } |
---|
940 | if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */ |
---|
941 | ZDICT_initDictItem(dictList); |
---|
942 | g_displayLevel = params.notificationLevel; |
---|
943 | if (selectivity==0) selectivity = g_selectivity_default; |
---|
944 | |
---|
945 | /* build dictionary */ |
---|
946 | if (selectivity>1) { /* selectivity == 1 => fast mode */ |
---|
947 | ZDICT_trainBuffer(dictList, dictListSize, |
---|
948 | samplesBuffer, sBuffSize, |
---|
949 | samplesSizes, nbSamples, |
---|
950 | selectivity, (U32)targetDictSize); |
---|
951 | |
---|
952 | /* display best matches */ |
---|
953 | if (g_displayLevel>= 3) { |
---|
954 | U32 const nb = 25; |
---|
955 | U32 const dictContentSize = ZDICT_dictSize(dictList); |
---|
956 | U32 u; |
---|
957 | DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); |
---|
958 | DISPLAYLEVEL(3, "list %u best segments \n", nb); |
---|
959 | for (u=1; u<=nb; u++) { |
---|
960 | U32 p = dictList[u].pos; |
---|
961 | U32 l = dictList[u].length; |
---|
962 | U32 d = MIN(40, l); |
---|
963 | DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", |
---|
964 | u, l, p, dictList[u].savings); |
---|
965 | ZDICT_printHex(3, (const char*)samplesBuffer+p, d); |
---|
966 | DISPLAYLEVEL(3, "| \n"); |
---|
967 | } } } |
---|
968 | |
---|
969 | /* create dictionary */ |
---|
970 | { U32 dictContentSize = ZDICT_dictSize(dictList); |
---|
971 | |
---|
972 | /* build dict content */ |
---|
973 | { U32 u; |
---|
974 | BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; |
---|
975 | for (u=1; u<dictList->pos; u++) { |
---|
976 | U32 l = dictList[u].length; |
---|
977 | ptr -= l; |
---|
978 | if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ |
---|
979 | memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); |
---|
980 | } } |
---|
981 | |
---|
982 | /* fast mode dict content */ |
---|
983 | if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */ |
---|
984 | DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */ |
---|
985 | DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10)); |
---|
986 | dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize, |
---|
987 | samplesBuffer, sBuffSize); |
---|
988 | } |
---|
989 | |
---|
990 | dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, |
---|
991 | samplesBuffer, samplesSizes, nbSamples, |
---|
992 | params); |
---|
993 | } |
---|
994 | |
---|
995 | /* clean up */ |
---|
996 | free(dictList); |
---|
997 | return dictSize; |
---|
998 | } |
---|
999 | |
---|
1000 | |
---|
1001 | /* issue : samplesBuffer need to be followed by a noisy guard band. |
---|
1002 | * work around : duplicate the buffer, and add the noise */ |
---|
1003 | size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, |
---|
1004 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
---|
1005 | ZDICT_params_t params) |
---|
1006 | { |
---|
1007 | void* newBuff; |
---|
1008 | size_t sBuffSize; |
---|
1009 | |
---|
1010 | { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; } |
---|
1011 | if (sBuffSize==0) return 0; /* empty content => no dictionary */ |
---|
1012 | newBuff = malloc(sBuffSize + NOISELENGTH); |
---|
1013 | if (!newBuff) return ERROR(memory_allocation); |
---|
1014 | |
---|
1015 | memcpy(newBuff, samplesBuffer, sBuffSize); |
---|
1016 | ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ |
---|
1017 | |
---|
1018 | { size_t const result = ZDICT_trainFromBuffer_unsafe( |
---|
1019 | dictBuffer, dictBufferCapacity, |
---|
1020 | newBuff, samplesSizes, nbSamples, |
---|
1021 | params); |
---|
1022 | free(newBuff); |
---|
1023 | return result; } |
---|
1024 | } |
---|
1025 | |
---|
1026 | |
---|
1027 | size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
---|
1028 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
---|
1029 | { |
---|
1030 | ZDICT_params_t params; |
---|
1031 | memset(¶ms, 0, sizeof(params)); |
---|
1032 | return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, |
---|
1033 | samplesBuffer, samplesSizes, nbSamples, |
---|
1034 | params); |
---|
1035 | } |
---|
1036 | |
---|
1037 | size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
---|
1038 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
---|
1039 | { |
---|
1040 | ZDICT_params_t params; |
---|
1041 | memset(¶ms, 0, sizeof(params)); |
---|
1042 | return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, |
---|
1043 | samplesBuffer, samplesSizes, nbSamples, |
---|
1044 | params); |
---|
1045 | } |
---|