Context Navigation

source: thirdparty/blosc/internal-complibs/zstd-0.7.4/dictBuilder/zdict.c @ 8ebc79b

Revision 8ebc79b, 38.0 KB checked in by Hal Finkel <hfinkel@…>, 8 years ago (diff)
Add the other internal compression libraries from blocs
Property mode set to `100644`

Line
1	/*
2	dictBuilder - dictionary builder for zstd
3	Copyright (C) Yann Collet 2016
4
5	BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6
7	Redistribution and use in source and binary forms, with or without
8	modification, are permitted provided that the following conditions are
9	met:
10
11	* Redistributions of source code must retain the above copyright
12	notice, this list of conditions and the following disclaimer.
13	* Redistributions in binary form must reproduce the above
14	copyright notice, this list of conditions and the following disclaimer
15	in the documentation and/or other materials provided with the
16	distribution.
17
18	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30	You can contact the author at :
31	- Zstd homepage : https://www.zstd.net
32	*/
33
34	/-*************************************
35	* Tuning parameters
36	****************************************/
37	#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
38
39
40	/-*************************************
41	* Compiler Options
42	****************************************/
43	/* Unix Large Files support (>4GB) */
44	#define _FILE_OFFSET_BITS 64
45	#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
46	# define _LARGEFILE_SOURCE
47	#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
48	# define _LARGEFILE64_SOURCE
49	#endif
50
51
52	/-************************************
53	* Dependencies
54	***************************************/
55	#include <stdlib.h> /* malloc, free */
56	#include <string.h> /* memset */
57	#include <stdio.h> /* fprintf, fopen, ftello64 */
58	#include <time.h> /* clock */
59
60	#include "mem.h" /* read */
61	#include "error_private.h"
62	#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
63	#define HUF_STATIC_LINKING_ONLY
64	#include "huf.h"
65	#include "zstd_internal.h" /* includes zstd.h */
66	#include "xxhash.h"
67	#include "divsufsort.h"
68	#ifndef ZDICT_STATIC_LINKING_ONLY
69	# define ZDICT_STATIC_LINKING_ONLY
70	#endif
71	#include "zdict.h"
72
73
74	/-************************************
75	* Constants
76	***************************************/
77	#define KB *(1 <<10)
78	#define MB *(1 <<20)
79	#define GB *(1U<<30)
80
81	#define DICTLISTSIZE 10000
82
83	#define NOISELENGTH 32
84	#define PRIME1 2654435761U
85	#define PRIME2 2246822519U
86
87	#define MINRATIO 4
88	static const U32 g_compressionLevel_default = 5;
89	static const U32 g_selectivity_default = 9;
90	static const size_t g_provision_entropySize = 200;
91	static const size_t g_min_fast_dictContent = 192;
92
93
94	/-************************************
95	* Console display
96	***************************************/
97	#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
98	#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
99	static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
100
101	#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
102	if (ZDICT_clockSpan(g_time) > refreshRate) \
103	{ g_time = clock(); DISPLAY(__VA_ARGS__); \
104	if (g_displayLevel>=4) fflush(stdout); } }
105	static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
106	static clock_t g_time = 0;
107
108	static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
109
110	static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
111	{
112	const BYTE* const b = (const BYTE*)ptr;
113	size_t u;
114	for (u=0; u<length; u++) {
115	BYTE c = b[u];
116	if (c<32 \|\| c>126) c = '.'; /* non-printable char */
117	DISPLAYLEVEL(dlevel, "%c", c);
118	}
119	}
120
121
122	/-*******************************************************
123	* Helper functions
124	**********************************************************/
125	unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
126
127	const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
128
129
130	/-*******************************************************
131	* Dictionary training functions
132	**********************************************************/
133	static unsigned ZDICT_NbCommonBytes (register size_t val)
134	{
135	if (MEM_isLittleEndian()) {
136	if (MEM_64bits()) {
137	# if defined(_MSC_VER) && defined(_WIN64)
138	unsigned long r = 0;
139	_BitScanForward64( &r, (U64)val );
140	return (unsigned)(r>>3);
141	# elif defined(__GNUC__) && (__GNUC__ >= 3)
142	return (__builtin_ctzll((U64)val) >> 3);
143	# else
144	static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
145	return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
146	# endif
147	} else { /* 32 bits */
148	# if defined(_MSC_VER)
149	unsigned long r=0;
150	_BitScanForward( &r, (U32)val );
151	return (unsigned)(r>>3);
152	# elif defined(__GNUC__) && (__GNUC__ >= 3)
153	return (__builtin_ctz((U32)val) >> 3);
154	# else
155	static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
156	return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
157	# endif
158	}
159	} else { /* Big Endian CPU */
160	if (MEM_64bits()) {
161	# if defined(_MSC_VER) && defined(_WIN64)
162	unsigned long r = 0;
163	_BitScanReverse64( &r, val );
164	return (unsigned)(r>>3);
165	# elif defined(__GNUC__) && (__GNUC__ >= 3)
166	return (__builtin_clzll(val) >> 3);
167	# else
168	unsigned r;
169	const unsigned n32 = sizeof(size_t)4; / calculate this way due to compiler complaining in 32-bits mode */
170	if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
171	if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
172	r += (!val);
173	return r;
174	# endif
175	} else { /* 32 bits */
176	# if defined(_MSC_VER)
177	unsigned long r = 0;
178	_BitScanReverse( &r, (unsigned long)val );
179	return (unsigned)(r>>3);
180	# elif defined(__GNUC__) && (__GNUC__ >= 3)
181	return (__builtin_clz((U32)val) >> 3);
182	# else
183	unsigned r;
184	if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
185	r += (!val);
186	return r;
187	# endif
188	} }
189	}
190
191
192	/*! ZDICT_count() :
193	Count the nb of common bytes between 2 pointers.
194	Note : this function presumes end of buffer followed by noisy guard band.
195	*/
196	static size_t ZDICT_count(const void* pIn, const void* pMatch)
197	{
198	const char* const pStart = (const char*)pIn;
199	for (;;) {
200	size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
201	if (!diff) {
202	pIn = (const char*)pIn+sizeof(size_t);
203	pMatch = (const char*)pMatch+sizeof(size_t);
204	continue;
205	}
206	pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
207	return (size_t)((const char*)pIn - pStart);
208	}
209	}
210
211
212	typedef struct {
213	U32 pos;
214	U32 length;
215	U32 savings;
216	} dictItem;
217
218	static void ZDICT_initDictItem(dictItem* d)
219	{
220	d->pos = 1;
221	d->length = 0;
222	d->savings = (U32)(-1);
223	}
224
225
226	#define LLIMIT 64 /* heuristic determined experimentally */
227	#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
228	static dictItem ZDICT_analyzePos(
229	BYTE* doneMarks,
230	const int* suffix, U32 start,
231	const void* buffer, U32 minRatio)
232	{
233	U32 lengthList[LLIMIT] = {0};
234	U32 cumulLength[LLIMIT] = {0};
235	U32 savings[LLIMIT] = {0};
236	const BYTE* b = (const BYTE*)buffer;
237	size_t length;
238	size_t maxLength = LLIMIT;
239	size_t pos = suffix[start];
240	U32 end = start;
241	dictItem solution;
242
243	/* init */
244	memset(&solution, 0, sizeof(solution));
245	doneMarks[pos] = 1;
246
247	/* trivial repetition cases */
248	if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
249	\|\|(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
250	\|\|(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
251	/* skip and mark segment */
252	U16 u16 = MEM_read16(b+pos+4);
253	U32 u, e = 6;
254	while (MEM_read16(b+pos+e) == u16) e+=2 ;
255	if (b[pos+e] == b[pos+e-1]) e++;
256	for (u=1; u<e; u++)
257	doneMarks[pos+u] = 1;
258	return solution;
259	}
260
261	/* look forward */
262	do {
263	end++;
264	length = ZDICT_count(b + pos, b + suffix[end]);
265	} while (length >=MINMATCHLENGTH);
266
267	/* look backward */
268	do {
269	length = ZDICT_count(b + pos, b + *(suffix+start-1));
270	if (length >=MINMATCHLENGTH) start--;
271	} while(length >= MINMATCHLENGTH);
272
273	/* exit if not found a minimum nb of repetitions */
274	if (end-start < minRatio) {
275	U32 idx;
276	for(idx=start; idx<end; idx++)
277	doneMarks[suffix[idx]] = 1;
278	return solution;
279	}
280
281	{ int i;
282	U32 searchLength;
283	U32 refinedStart = start;
284	U32 refinedEnd = end;
285
286	DISPLAYLEVEL(4, "\n");
287	DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
288	DISPLAYLEVEL(4, "\n");
289
290	for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
291	BYTE currentChar = 0;
292	U32 currentCount = 0;
293	U32 currentID = refinedStart;
294	U32 id;
295	U32 selectedCount = 0;
296	U32 selectedID = currentID;
297	for (id =refinedStart; id < refinedEnd; id++) {
298	if (b[ suffix[id] + searchLength] != currentChar) {
299	if (currentCount > selectedCount) {
300	selectedCount = currentCount;
301	selectedID = currentID;
302	}
303	currentID = id;
304	currentChar = b[ suffix[id] + searchLength];
305	currentCount = 0;
306	}
307	currentCount ++;
308	}
309	if (currentCount > selectedCount) { /* for last */
310	selectedCount = currentCount;
311	selectedID = currentID;
312	}
313
314	if (selectedCount < minRatio)
315	break;
316	refinedStart = selectedID;
317	refinedEnd = refinedStart + selectedCount;
318	}
319
320	/* evaluate gain based on new ref */
321	start = refinedStart;
322	pos = suffix[refinedStart];
323	end = start;
324	memset(lengthList, 0, sizeof(lengthList));
325
326	/* look forward */
327	do {
328	end++;
329	length = ZDICT_count(b + pos, b + suffix[end]);
330	if (length >= LLIMIT) length = LLIMIT-1;
331	lengthList[length]++;
332	} while (length >=MINMATCHLENGTH);
333
334	/* look backward */
335	do {
336	length = ZDICT_count(b + pos, b + suffix[start-1]);
337	if (length >= LLIMIT) length = LLIMIT-1;
338	lengthList[length]++;
339	if (length >=MINMATCHLENGTH) start--;
340	} while(length >= MINMATCHLENGTH);
341
342	/* largest useful length */
343	memset(cumulLength, 0, sizeof(cumulLength));
344	cumulLength[maxLength-1] = lengthList[maxLength-1];
345	for (i=(int)(maxLength-2); i>=0; i--)
346	cumulLength[i] = cumulLength[i+1] + lengthList[i];
347
348	for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
349	maxLength = i;
350
351	/* reduce maxLength in case of final into repetitive data */
352	{ U32 l = (U32)maxLength;
353	BYTE const c = b[pos + maxLength-1];
354	while (b[pos+l-2]==c) l--;
355	maxLength = l;
356	}
357	if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
358
359	/* calculate savings */
360	savings[5] = 0;
361	for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
362	savings[i] = savings[i-1] + (lengthList[i] * (i-3));
363
364	DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
365	(U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
366
367	solution.pos = (U32)pos;
368	solution.length = (U32)maxLength;
369	solution.savings = savings[maxLength];
370
371	/* mark positions done */
372	{ U32 id;
373	for (id=start; id<end; id++) {
374	U32 p, pEnd;
375	U32 const testedPos = suffix[id];
376	if (testedPos == pos)
377	length = solution.length;
378	else {
379	length = ZDICT_count(b+pos, b+testedPos);
380	if (length > solution.length) length = solution.length;
381	}
382	pEnd = (U32)(testedPos + length);
383	for (p=testedPos; p<pEnd; p++)
384	doneMarks[p] = 1;
385	} } }
386
387	return solution;
388	}
389
390
391	/*! ZDICT_checkMerge
392	check if dictItem can be merged, do it if possible
393	@return : id of destination elt, 0 if not merged
394	*/
395	static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
396	{
397	const U32 tableSize = table->pos;
398	const U32 max = elt.pos + (elt.length-1);
399
400	/* tail overlap */
401	U32 u; for (u=1; u<tableSize; u++) {
402	if (u==eltNbToSkip) continue;
403	if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
404	/* append */
405	U32 addedLength = table[u].pos - elt.pos;
406	table[u].length += addedLength;
407	table[u].pos = elt.pos;
408	table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
409	table[u].savings += elt.length / 8; /* rough approx */
410	elt = table[u];
411	while ((u>1) && (table[u-1].savings < elt.savings))
412	table[u] = table[u-1], u--;
413	table[u] = elt;
414	return u;
415	} }
416
417	/* front overlap */
418	for (u=1; u<tableSize; u++) {
419	if (u==eltNbToSkip) continue;
420	if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
421	/* append */
422	int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
423	table[u].savings += elt.length / 8; /* rough approx */
424	if (addedLength > 0) { /* otherwise, already included */
425	table[u].length += addedLength;
426	table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
427	}
428	elt = table[u];
429	while ((u>1) && (table[u-1].savings < elt.savings))
430	table[u] = table[u-1], u--;
431	table[u] = elt;
432	return u;
433	} }
434
435	return 0;
436	}
437
438
439	static void ZDICT_removeDictItem(dictItem* table, U32 id)
440	{
441	/* convention : first element is nb of elts */
442	U32 const max = table->pos;
443	U32 u;
444	if (!id) return; /* protection, should never happen */
445	for (u=id; u<max-1; u++)
446	table[u] = table[u+1];
447	table->pos--;
448	}
449
450
451	static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
452	{
453	/* merge if possible */
454	U32 mergeId = ZDICT_checkMerge(table, elt, 0);
455	if (mergeId) {
456	U32 newMerge = 1;
457	while (newMerge) {
458	newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
459	if (newMerge) ZDICT_removeDictItem(table, mergeId);
460	mergeId = newMerge;
461	}
462	return;
463	}
464
465	/* insert */
466	{ U32 current;
467	U32 nextElt = table->pos;
468	if (nextElt >= maxSize) nextElt = maxSize-1;
469	current = nextElt-1;
470	while (table[current].savings < elt.savings) {
471	table[current+1] = table[current];
472	current--;
473	}
474	table[current+1] = elt;
475	table->pos = nextElt+1;
476	}
477	}
478
479
480	static U32 ZDICT_dictSize(const dictItem* dictList)
481	{
482	U32 u, dictSize = 0;
483	for (u=1; u<dictList[0].pos; u++)
484	dictSize += dictList[u].length;
485	return dictSize;
486	}
487
488
489	static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
490	const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
491	const size_t* fileSizes, unsigned nbFiles,
492	U32 shiftRatio, unsigned maxDictSize)
493	{
494	int* const suffix0 = (int)malloc((bufferSize+2)sizeof(*suffix0));
495	int* const suffix = suffix0+1;
496	U32* reverseSuffix = (U32)malloc((bufferSize)sizeof(*reverseSuffix));
497	BYTE* doneMarks = (BYTE)malloc((bufferSize+16)sizeof(doneMarks)); / +16 for overflow security */
498	U32* filePos = (U32)malloc(nbFiles sizeof(*filePos));
499	U32 minRatio = nbFiles >> shiftRatio;
500	size_t result = 0;
501
502	/* init */
503	DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
504	if (!suffix0 \|\| !reverseSuffix \|\| !doneMarks \|\| !filePos) {
505	result = ERROR(memory_allocation);
506	goto _cleanup;
507	}
508	if (minRatio < MINRATIO) minRatio = MINRATIO;
509	memset(doneMarks, 0, bufferSize+16);
510
511	/* limit sample set size (divsufsort limitation)*/
512	if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
513	while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
514
515	/* sort */
516	DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
517	{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
518	if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
519	}
520	suffix[bufferSize] = (int)bufferSize; /* leads into noise */
521	suffix0[0] = (int)bufferSize; /* leads into noise */
522	/* build reverse suffix sort */
523	{ size_t pos;
524	for (pos=0; pos < bufferSize; pos++)
525	reverseSuffix[suffix[pos]] = (U32)pos;
526	/* build file pos */
527	filePos[0] = 0;
528	for (pos=1; pos<nbFiles; pos++)
529	filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
530	}
531
532	DISPLAYLEVEL(2, "finding patterns ... \n");
533	DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
534
535	{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {
536	dictItem solution;
537	if (doneMarks[cursor]) { cursor++; continue; }
538	solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
539	if (solution.length==0) { cursor++; continue; }
540	ZDICT_insertDictItem(dictList, dictListSize, solution);
541	cursor += solution.length;
542	DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
543	} }
544
545	/* limit dictionary size */
546	{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
547	U32 currentSize = 0;
548	U32 n; for (n=1; n<max; n++) {
549	currentSize += dictList[n].length;
550	if (currentSize > maxDictSize) break;
551	}
552	dictList->pos = n;
553	}
554
555	_cleanup:
556	free(suffix0);
557	free(reverseSuffix);
558	free(doneMarks);
559	free(filePos);
560	return result;
561	}
562
563
564	static void ZDICT_fillNoise(void* buffer, size_t length)
565	{
566	unsigned acc = PRIME1;
567	size_t p=0;;
568	for (p=0; p<length; p++) {
569	acc *= PRIME2;
570	((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
571	}
572	}
573
574
575	typedef struct
576	{
577	ZSTD_CCtx* ref;
578	ZSTD_CCtx* zc;
579	void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
580	} EStats_ress_t;
581
582	#define MAXREPOFFSET 1024
583
584	static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
585	U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
586	const void* src, size_t srcSize)
587	{
588	size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
589	size_t cSize;
590
591	if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
592	{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref);
593	if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
594	}
595	cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
596	if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
597
598	if (cSize) { /* if == 0; block is not compressible */
599	const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
600
601	/* literals stats */
602	{ const BYTE* bytePtr;
603	for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
604	countLit[*bytePtr]++;
605	}
606
607	/* seqStats */
608	{ size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart);
609	ZSTD_seqToCodes(seqStorePtr, nbSeq);
610
611	{ const BYTE* codePtr = seqStorePtr->offCodeStart;
612	size_t u;
613	for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
614	}
615
616	{ const BYTE* codePtr = seqStorePtr->mlCodeStart;
617	size_t u;
618	for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
619	}
620
621	{ const BYTE* codePtr = seqStorePtr->llCodeStart;
622	size_t u;
623	for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
624	} }
625
626	/* rep offsets */
627	{ const U32* const offsetPtr = seqStorePtr->offsetStart;
628	U32 offset1 = offsetPtr[0] - 3;
629	U32 offset2 = offsetPtr[1] - 3;
630	if (offset1 >= MAXREPOFFSET) offset1 = 0;
631	if (offset2 >= MAXREPOFFSET) offset2 = 0;
632	repOffsets[offset1] += 3;
633	repOffsets[offset2] += 1;
634	}
635	}
636	}
637
638	/*
639	static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
640	{
641	unsigned u;
642	size_t max=0;
643	for (u=0; u<nbFiles; u++)
644	if (max < fileSizes[u]) max = fileSizes[u];
645	return max;
646	}
647	*/
648
649	static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
650	{
651	size_t total=0;
652	unsigned u;
653	for (u=0; u<nbFiles; u++) total += fileSizes[u];
654	return total;
655	}
656
657	typedef struct { U32 offset; U32 count; } offsetCount_t;
658
659	static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
660	{
661	U32 u;
662	table[ZSTD_REP_NUM].offset = val;
663	table[ZSTD_REP_NUM].count = count;
664	for (u=ZSTD_REP_NUM; u>0; u--) {
665	offsetCount_t tmp;
666	if (table[u-1].count >= table[u].count) break;
667	tmp = table[u-1];
668	table[u-1] = table[u];
669	table[u] = tmp;
670	}
671	}
672
673
674	#define OFFCODE_MAX 18 /* only applicable to first block */
675	static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
676	unsigned compressionLevel,
677	const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
678	const void* dictBuffer, size_t dictBufferSize)
679	{
680	U32 countLit[256];
681	HUF_CREATE_STATIC_CTABLE(hufTable, 255);
682	U32 offcodeCount[OFFCODE_MAX+1];
683	short offcodeNCount[OFFCODE_MAX+1];
684	U32 matchLengthCount[MaxML+1];
685	short matchLengthNCount[MaxML+1];
686	U32 litLengthCount[MaxLL+1];
687	short litLengthNCount[MaxLL+1];
688	U32 repOffset[MAXREPOFFSET] = { 0 };
689	offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
690	EStats_ress_t esr;
691	ZSTD_parameters params;
692	U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
693	size_t pos = 0, errorCode;
694	size_t eSize = 0;
695	size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
696	size_t const averageSampleSize = totalSrcSize / nbFiles;
697	BYTE* dstPtr = (BYTE*)dstBuffer;
698
699	/* init */
700	for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
701	for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
702	for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
703	for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
704	repOffset[1] = repOffset[4] = repOffset[8] = 1;
705	memset(bestRepOffset, 0, sizeof(bestRepOffset));
706	esr.ref = ZSTD_createCCtx();
707	esr.zc = ZSTD_createCCtx();
708	esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
709	if (!esr.ref \|\| !esr.zc \|\| !esr.workPlace) {
710	eSize = ERROR(memory_allocation);
711	DISPLAYLEVEL(1, "Not enough memory");
712	goto _cleanup;
713	}
714	if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
715	params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
716	{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
717	if (ZSTD_isError(beginResult)) {
718	eSize = ERROR(GENERIC);
719	DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed ");
720	goto _cleanup;
721	} }
722
723	/* collect stats on all files */
724	for (u=0; u<nbFiles; u++) {
725	ZDICT_countEStats(esr, params,
726	countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
727	(const char*)srcBuffer + pos, fileSizes[u]);
728	pos += fileSizes[u];
729	}
730
731	/* analyze */
732	errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
733	if (HUF_isError(errorCode)) {
734	eSize = ERROR(GENERIC);
735	DISPLAYLEVEL(1, "HUF_buildCTable error");
736	goto _cleanup;
737	}
738	huffLog = (U32)errorCode;
739
740	/* looking for most common first offsets */
741	{ U32 offset;
742	for (offset=1; offset<MAXREPOFFSET; offset++)
743	ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
744	}
745	/* note : the result of this phase should be used to better appreciate the impact on statistics */
746
747	total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
748	errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
749	if (FSE_isError(errorCode)) {
750	eSize = ERROR(GENERIC);
751	DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
752	goto _cleanup;
753	}
754	Offlog = (U32)errorCode;
755
756	total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
757	errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
758	if (FSE_isError(errorCode)) {
759	eSize = ERROR(GENERIC);
760	DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
761	goto _cleanup;
762	}
763	mlLog = (U32)errorCode;
764
765	total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
766	errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
767	if (FSE_isError(errorCode)) {
768	eSize = ERROR(GENERIC);
769	DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
770	goto _cleanup;
771	}
772	llLog = (U32)errorCode;
773
774
775	/* write result to buffer */
776	{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
777	if (HUF_isError(hhSize)) {
778	eSize = ERROR(GENERIC);
779	DISPLAYLEVEL(1, "HUF_writeCTable error");
780	goto _cleanup;
781	}
782	dstPtr += hhSize;
783	maxDstSize -= hhSize;
784	eSize += hhSize;
785	}
786
787	{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
788	if (FSE_isError(ohSize)) {
789	eSize = ERROR(GENERIC);
790	DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
791	goto _cleanup;
792	}
793	dstPtr += ohSize;
794	maxDstSize -= ohSize;
795	eSize += ohSize;
796	}
797
798	{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
799	if (FSE_isError(mhSize)) {
800	eSize = ERROR(GENERIC);
801	DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
802	goto _cleanup;
803	}
804	dstPtr += mhSize;
805	maxDstSize -= mhSize;
806	eSize += mhSize;
807	}
808
809	{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
810	if (FSE_isError(lhSize)) {
811	eSize = ERROR(GENERIC);
812	DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
813	goto _cleanup;
814	}
815	dstPtr += lhSize;
816	maxDstSize -= lhSize;
817	eSize += lhSize;
818	}
819
820	if (maxDstSize<12) {
821	eSize = ERROR(GENERIC);
822	DISPLAYLEVEL(1, "not enough space to write RepOffsets");
823	goto _cleanup;
824	}
825	# if 0
826	MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
827	MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
828	MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
829	#else
830	/* at this stage, we don't use the result of "most common first offset",
831	as the impact of statistics is not properly evaluated */
832	MEM_writeLE32(dstPtr+0, repStartValue[0]);
833	MEM_writeLE32(dstPtr+4, repStartValue[1]);
834	MEM_writeLE32(dstPtr+8, repStartValue[2]);
835	#endif
836	dstPtr += 12;
837	eSize += 12;
838
839	_cleanup:
840	ZSTD_freeCCtx(esr.ref);
841	ZSTD_freeCCtx(esr.zc);
842	free(esr.workPlace);
843
844	return eSize;
845	}
846
847
848	#define DIB_FASTSEGMENTSIZE 64
849	/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
850	Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
851	up to `dictSize`.
852	Filling starts from the end of `dictBuffer`, down to maximum possible.
853	if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
854	@return : amount of data written into `dictBuffer`,
855	or an error code
856	*/
857	static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
858	const void* samplesBuffer, size_t samplesSize)
859	{
860	char* dstPtr = (char*)dictBuffer + dictSize;
861	const char* srcPtr = (const char*)samplesBuffer;
862	size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
863	size_t segNb, interSize;
864
865	if (nbSegments <= 2) return ERROR(srcSize_wrong);
866	if (samplesSize < dictSize) return ERROR(srcSize_wrong);
867
868	/* first and last segments are part of dictionary, in case they contain interesting header/footer */
869	dstPtr -= DIB_FASTSEGMENTSIZE;
870	memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
871	dstPtr -= DIB_FASTSEGMENTSIZE;
872	memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
873
874	/* regularly copy a segment */
875	interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
876	srcPtr += DIB_FASTSEGMENTSIZE;
877	for (segNb=2; segNb < nbSegments; segNb++) {
878	srcPtr += interSize;
879	dstPtr -= DIB_FASTSEGMENTSIZE;
880	memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
881	srcPtr += DIB_FASTSEGMENTSIZE;
882	}
883
884	return nbSegments * DIB_FASTSEGMENTSIZE;
885	}
886
887	size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
888	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
889	ZDICT_params_t params)
890	{
891	size_t hSize;
892	unsigned const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
893
894	/* dictionary header */
895	MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
896	{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
897	U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
898	U32 const dictID = params.dictID ? params.dictID : compliantID;
899	MEM_writeLE32((char*)dictBuffer+4, dictID);
900	}
901	hSize = 8;
902
903	/* entropy tables */
904	DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
905	DISPLAYLEVEL(2, "statistics ... \n");
906	hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
907	compressionLevel,
908	samplesBuffer, samplesSizes, nbSamples,
909	(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
910
911	if (hSize + dictContentSize < dictBufferCapacity)
912	memmove((char)dictBuffer + hSize, (char)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
913	return MIN(dictBufferCapacity, hSize+dictContentSize);
914	}
915
916
917	#define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
918	/*! ZDICT_trainFromBuffer_unsafe() :
919	* `samplesBuffer` must be followed by noisy guard band.
920	* @return : size of dictionary.
921	*/
922	size_t ZDICT_trainFromBuffer_unsafe(
923	void* dictBuffer, size_t maxDictSize,
924	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
925	ZDICT_params_t params)
926	{
927	U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
928	dictItem* const dictList = (dictItem)malloc(dictListSize sizeof(*dictList));
929	unsigned selectivity = params.selectivityLevel;
930	size_t const targetDictSize = maxDictSize;
931	size_t sBuffSize;
932	size_t dictSize = 0;
933
934	/* checks */
935	if (!dictList) return ERROR(memory_allocation);
936	if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
937
938	/* init */
939	{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
940	if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
941	ZDICT_initDictItem(dictList);
942	g_displayLevel = params.notificationLevel;
943	if (selectivity==0) selectivity = g_selectivity_default;
944
945	/* build dictionary */
946	if (selectivity>1) { /* selectivity == 1 => fast mode */
947	ZDICT_trainBuffer(dictList, dictListSize,
948	samplesBuffer, sBuffSize,
949	samplesSizes, nbSamples,
950	selectivity, (U32)targetDictSize);
951
952	/* display best matches */
953	if (g_displayLevel>= 3) {
954	U32 const nb = 25;
955	U32 const dictContentSize = ZDICT_dictSize(dictList);
956	U32 u;
957	DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
958	DISPLAYLEVEL(3, "list %u best segments \n", nb);
959	for (u=1; u<=nb; u++) {
960	U32 p = dictList[u].pos;
961	U32 l = dictList[u].length;
962	U32 d = MIN(40, l);
963	DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes \|",
964	u, l, p, dictList[u].savings);
965	ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
966	DISPLAYLEVEL(3, "\| \n");
967	} } }
968
969	/* create dictionary */
970	{ U32 dictContentSize = ZDICT_dictSize(dictList);
971
972	/* build dict content */
973	{ U32 u;
974	BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
975	for (u=1; u<dictList->pos; u++) {
976	U32 l = dictList[u].length;
977	ptr -= l;
978	if (ptr<(BYTE)dictBuffer) { free(dictList); return ERROR(GENERIC); } / should not happen */
979	memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
980	} }
981
982	/* fast mode dict content */
983	if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
984	DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
985	DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
986	dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
987	samplesBuffer, sBuffSize);
988	}
989
990	dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
991	samplesBuffer, samplesSizes, nbSamples,
992	params);
993	}
994
995	/* clean up */
996	free(dictList);
997	return dictSize;
998	}
999
1000
1001	/* issue : samplesBuffer need to be followed by a noisy guard band.
1002	* work around : duplicate the buffer, and add the noise */
1003	size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
1004	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1005	ZDICT_params_t params)
1006	{
1007	void* newBuff;
1008	size_t sBuffSize;
1009
1010	{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
1011	if (sBuffSize==0) return 0; /* empty content => no dictionary */
1012	newBuff = malloc(sBuffSize + NOISELENGTH);
1013	if (!newBuff) return ERROR(memory_allocation);
1014
1015	memcpy(newBuff, samplesBuffer, sBuffSize);
1016	ZDICT_fillNoise((char)newBuff + sBuffSize, NOISELENGTH); / guard band, for end of buffer condition */
1017
1018	{ size_t const result = ZDICT_trainFromBuffer_unsafe(
1019	dictBuffer, dictBufferCapacity,
1020	newBuff, samplesSizes, nbSamples,
1021	params);
1022	free(newBuff);
1023	return result; }
1024	}
1025
1026
1027	size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1028	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1029	{
1030	ZDICT_params_t params;
1031	memset(&params, 0, sizeof(params));
1032	return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
1033	samplesBuffer, samplesSizes, nbSamples,
1034	params);
1035	}
1036
1037	size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1038	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1039	{
1040	ZDICT_params_t params;
1041	memset(&params, 0, sizeof(params));
1042	return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1043	samplesBuffer, samplesSizes, nbSamples,
1044	params);
1045	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: