source: thirdparty/blosc/shuffle.c @ 00587dc

Revision 00587dc, 15.5 KB checked in by Hal Finkel <hfinkel@…>, 9 years ago (diff)

Initial Commit (gio-base-20150317)

  • Property mode set to 100644
Line 
1/*********************************************************************
2  Blosc - Blocked Shuffling and Compression Library
3
4  Author: Francesc Alted <[email protected]>
5  Creation date: 2009-05-20
6
7  See LICENSES/BLOSC.txt for details about copyright and rights to use.
8**********************************************************************/
9
10#include <stdio.h>
11#include <string.h>
12#include "shuffle.h"
13
14#if defined(_WIN32) && !defined(__MINGW32__)
15  #include <windows.h>
16  #include "win32/stdint-windows.h"
17  #define __SSE2__          /* Windows does not define this by default */
18#else
19  #include <stdint.h>
20  #include <inttypes.h>
21#endif  /* _WIN32 */
22
23
24/* The non-SSE2 versions of shuffle and unshuffle */
25
26/* Shuffle a block.  This can never fail. */
/* Generic (non-SSE2) shuffle: regroup the bytes of _src so that byte 0 of
   every element comes first, then byte 1 of every element, and so on.
   Trailing bytes that do not form a whole element are copied verbatim.
   This can never fail. */
static void _shuffle(size_t bytesoftype, size_t blocksize,
                         uint8_t* _src, uint8_t* _dest)
{
  const size_t nelem = blocksize / bytesoftype;   /* whole elements in block */
  const size_t remainder = blocksize % bytesoftype;
  size_t elem, byte;

  /* Scatter each element's bytes into their per-byte planes. */
  for (elem = 0; elem < nelem; elem++) {
    for (byte = 0; byte < bytesoftype; byte++) {
      _dest[byte * nelem + elem] = _src[elem * bytesoftype + byte];
    }
  }
  /* Copy the leftover tail unchanged. */
  memcpy(_dest + nelem * bytesoftype, _src + nelem * bytesoftype, remainder);
}
42
43/* Unshuffle a block.  This can never fail. */
/* Generic (non-SSE2) unshuffle: inverse of _shuffle.  Reassemble each
   element from its per-byte planes; trailing bytes that do not form a
   whole element are copied verbatim.  This can never fail. */
static void _unshuffle(size_t bytesoftype, size_t blocksize,
                       uint8_t* _src, uint8_t* _dest)
{
  const size_t nelem = blocksize / bytesoftype;   /* whole elements in block */
  const size_t remainder = blocksize % bytesoftype;
  size_t elem, byte;

  /* Gather byte `byte` of every element back into element order. */
  for (byte = 0; byte < bytesoftype; byte++) {
    for (elem = 0; elem < nelem; elem++) {
      _dest[elem * bytesoftype + byte] = _src[byte * nelem + elem];
    }
  }
  /* Copy the leftover tail unchanged. */
  memcpy(_dest + nelem * bytesoftype, _src + nelem * bytesoftype, remainder);
}
59
60
61#ifdef __SSE2__
62
63/* The SSE2 versions of shuffle and unshuffle */
64
65#include <emmintrin.h>
66
67/* The next is useful for debugging purposes */
68#if 0
/* Dump the 16 bytes of an XMM register as hex (debugging aid).
   Use _mm_storeu_si128 instead of storing through a (__m128i *) cast of
   `buf`: a plain uint8_t array is not guaranteed to be 16-byte aligned,
   so the aligned store was a potential fault and an aliasing violation. */
static void printxmm(__m128i xmm0)
{
  uint8_t buf[16];

  _mm_storeu_si128((__m128i *)buf, xmm0);
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15]);
}
80#endif
81
82
83/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
/* Routine optimized for shuffling a buffer for a type size of 2 bytes.
   Processes 32 source bytes (16 two-byte elements) per iteration and
   writes the two byte planes with aligned stores, so `dest` must be
   16-byte aligned and `size` a multiple of 32. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t blk, off, n;
  size_t nvec;              /* number of 32-byte groups */
  __m128i regs[2], tmp[2];

  nvec = size / (16*2);
  for (blk = 0, off = 0; blk < nvec; blk++, off += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of
       32 bytes */
    for (n = 0; n < 2; n++) {
      regs[n] = _mm_loadu_si128((__m128i*)(src + off + n*16));
      regs[n] = _mm_shufflelo_epi16(regs[n], 0xd8);
      regs[n] = _mm_shufflehi_epi16(regs[n], 0xd8);
      regs[n] = _mm_shuffle_epi32(regs[n], 0xd8);
      tmp[n]  = _mm_shuffle_epi32(regs[n], 0x4e);
      regs[n] = _mm_unpacklo_epi8(regs[n], tmp[n]);
      regs[n] = _mm_shuffle_epi32(regs[n], 0xd8);
      tmp[n]  = _mm_shuffle_epi32(regs[n], 0x4e);
      regs[n] = _mm_unpacklo_epi16(regs[n], tmp[n]);
      regs[n] = _mm_shuffle_epi32(regs[n], 0xd8);
    }
    /* Transpose quad words (single pair, loop unrolled) */
    tmp[0] = _mm_unpacklo_epi64(regs[0], regs[1]);
    tmp[1] = _mm_unpackhi_epi64(regs[0], regs[1]);
    /* Store one vector into each byte plane (aligned stores) */
    for (n = 0; n < 2; n++) {
      ((__m128i *)dest)[n*nvec + blk] = tmp[n];
    }
  }
}
118
119
120/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
/* Routine optimized for shuffling a buffer for a type size of 4 bytes.
   Processes 64 source bytes (16 four-byte elements) per iteration and
   writes the four byte planes with aligned stores, so `dest` must be
   16-byte aligned and `size` a multiple of 64. */
static void
shuffle4(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t blk, off, n;
  size_t nvec;              /* number of 64-byte groups */
  __m128i regs[4], tmp[4];

  nvec = size / (16*4);
  for (blk = 0, off = 0; blk < nvec; blk++, off += 16*4) {
    /* Fetch and transpose bytes and words in groups of 64 bytes */
    for (n = 0; n < 4; n++) {
      regs[n] = _mm_loadu_si128((__m128i*)(src + off + n*16));
      tmp[n]  = _mm_shuffle_epi32(regs[n], 0xd8);
      regs[n] = _mm_shuffle_epi32(regs[n], 0x8d);
      regs[n] = _mm_unpacklo_epi8(tmp[n], regs[n]);
      tmp[n]  = _mm_shuffle_epi32(regs[n], 0x4e);
      regs[n] = _mm_unpacklo_epi16(regs[n], tmp[n]);
    }
    /* Transpose double words */
    for (n = 0; n < 2; n++) {
      tmp[n*2]   = _mm_unpacklo_epi32(regs[n*2], regs[n*2+1]);
      tmp[n*2+1] = _mm_unpackhi_epi32(regs[n*2], regs[n*2+1]);
    }
    /* Transpose quad words */
    for (n = 0; n < 2; n++) {
      regs[n*2]   = _mm_unpacklo_epi64(tmp[n], tmp[n+2]);
      regs[n*2+1] = _mm_unpackhi_epi64(tmp[n], tmp[n+2]);
    }
    /* Store one vector into each byte plane (aligned stores) */
    for (n = 0; n < 4; n++) {
      ((__m128i *)dest)[n*nvec + blk] = regs[n];
    }
  }
}
155
156
157/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
/* Routine optimized for shuffling a buffer for a type size of 8 bytes.
   Processes 128 source bytes (16 eight-byte elements) per iteration and
   writes the eight byte planes with aligned stores, so `dest` must be
   16-byte aligned and `size` a multiple of 128. */
static void
shuffle8(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t blk, off, n, m;
  size_t nvec;              /* number of 128-byte groups */
  __m128i regs[8], tmp[8];

  nvec = size / (16*8);
  for (blk = 0, off = 0; blk < nvec; blk++, off += 16*8) {
    /* Fetch and transpose bytes in groups of 128 bytes */
    for (n = 0; n < 8; n++) {
      regs[n] = _mm_loadu_si128((__m128i*)(src + off + n*16));
      tmp[n] = _mm_shuffle_epi32(regs[n], 0x4e);
      tmp[n] = _mm_unpacklo_epi8(regs[n], tmp[n]);
    }
    /* Transpose words */
    for (n = 0, m = 0; n < 4; n++, m += 2) {
      regs[n*2]   = _mm_unpacklo_epi16(tmp[m], tmp[m+1]);
      regs[n*2+1] = _mm_unpackhi_epi16(tmp[m], tmp[m+1]);
    }
    /* Transpose double words; source pairs are (0,2),(1,3),(4,6),(5,7) */
    for (n = 0, m = 0; n < 4; n++, m++) {
      if (n == 2) m += 2;
      tmp[n*2]   = _mm_unpacklo_epi32(regs[m], regs[m+2]);
      tmp[n*2+1] = _mm_unpackhi_epi32(regs[m], regs[m+2]);
    }
    /* Transpose quad words */
    for (n = 0; n < 4; n++) {
      regs[n*2]   = _mm_unpacklo_epi64(tmp[n], tmp[n+4]);
      regs[n*2+1] = _mm_unpackhi_epi64(tmp[n], tmp[n+4]);
    }
    /* Store one vector into each byte plane (aligned stores) */
    for (n = 0; n < 8; n++) {
      ((__m128i *)dest)[n*nvec + blk] = regs[n];
    }
  }
}
195
196
197/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
/* Routine optimized for shuffling a buffer for a type size of 16 bytes.
   Processes 256 source bytes (16 sixteen-byte elements) per iteration and
   writes the sixteen byte planes with aligned stores, so `dest` must be
   16-byte aligned and `size` a multiple of 256. */
static void
shuffle16(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t blk, off, n, p;
  size_t nvec;              /* number of 256-byte groups */
  __m128i regs[16], tmp[16];

  nvec = size / (16*16);
  for (blk = 0, off = 0; blk < nvec; blk++, off += 16*16) {
    /* Fetch elements in groups of 256 bytes */
    for (n = 0; n < 16; n++) {
      regs[n] = _mm_loadu_si128((__m128i*)(src + off + n*16));
    }
    /* Transpose bytes */
    for (n = 0; n < 8; n++) {
      tmp[n*2]   = _mm_unpacklo_epi8(regs[n*2], regs[n*2+1]);
      tmp[n*2+1] = _mm_unpackhi_epi8(regs[n*2], regs[n*2+1]);
    }
    /* Transpose words; pair bases follow 0,1,4,5,8,9,12,13 */
    for (n = 0; n < 8; n++) {
      p = (n/2)*4 + (n%2);
      regs[n*2]   = _mm_unpacklo_epi16(tmp[p], tmp[p+2]);
      regs[n*2+1] = _mm_unpackhi_epi16(tmp[p], tmp[p+2]);
    }
    /* Transpose double words; pair bases follow 0,1,2,3,8,9,10,11 */
    for (n = 0; n < 8; n++) {
      p = (n/4)*8 + (n%4);
      tmp[n*2]   = _mm_unpacklo_epi32(regs[p], regs[p+4]);
      tmp[n*2+1] = _mm_unpackhi_epi32(regs[p], regs[p+4]);
    }
    /* Transpose quad words */
    for (n = 0; n < 8; n++) {
      regs[n*2]   = _mm_unpacklo_epi64(tmp[n], tmp[n+8]);
      regs[n*2+1] = _mm_unpackhi_epi64(tmp[n], tmp[n+8]);
    }
    /* Store one vector into each byte plane (aligned stores) */
    for (n = 0; n < 16; n++) {
      ((__m128i *)dest)[n*nvec + blk] = regs[n];
    }
  }
}
239
240
241/* Shuffle a block.  This can never fail. */
/* Shuffle a block.  This can never fail.
   The SSE2 kernels store through `_dest` with aligned accesses and only
   handle blocks whose size is a power of two and at least 256 bytes;
   anything else goes through the generic byte-by-byte path. */
void shuffle(size_t bytesoftype, size_t blocksize,
             uint8_t* _src, uint8_t* _dest) {
  int need_generic = ((uintptr_t)_dest % 16 != 0)
      || ((blocksize & (blocksize - 1)) != 0)
      || (blocksize < 256);

  if (need_generic) {
    /* _dest not 16-byte aligned, blocksize not a power of two, or block
       too small: use the non-SSE2 implementation. */
    _shuffle(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized shuffle for the element sizes we have SSE2 kernels for. */
  switch (bytesoftype) {
  case 2:
    shuffle2(_dest, _src, blocksize);
    break;
  case 4:
    shuffle4(_dest, _src, blocksize);
    break;
  case 8:
    shuffle8(_dest, _src, blocksize);
    break;
  case 16:
    shuffle16(_dest, _src, blocksize);
    break;
  default:
    /* No specialized kernel for this element size. */
    _shuffle(bytesoftype, blocksize, _src, _dest);
    break;
  }
}
275
276
277/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
278static void
279unshuffle2(uint8_t* dest, uint8_t* orig, size_t size)
280{
281  size_t i, k;
282  size_t neblock, numof16belem;
283  __m128i xmm1[2], xmm2[2];
284
285  neblock = size / 2;
286  numof16belem = neblock / 16;
287  for (i = 0, k = 0; i < numof16belem; i++, k += 2) {
288    /* Load the first 32 bytes in 2 XMM registrers */
289    xmm1[0] = ((__m128i *)orig)[0*numof16belem+i];
290    xmm1[1] = ((__m128i *)orig)[1*numof16belem+i];
291    /* Shuffle bytes */
292    /* Compute the low 32 bytes */
293    xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]);
294    /* Compute the hi 32 bytes */
295    xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]);
296    /* Store the result vectors in proper order */
297    ((__m128i *)dest)[k+0] = xmm2[0];
298    ((__m128i *)dest)[k+1] = xmm2[1];
299  }
300}
301
302
303/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
304static void
305unshuffle4(uint8_t* dest, uint8_t* orig, size_t size)
306{
307  size_t i, j, k;
308  size_t neblock, numof16belem;
309  __m128i xmm0[4], xmm1[4];
310
311  neblock = size / 4;
312  numof16belem = neblock / 16;
313  for (i = 0, k = 0; i < numof16belem; i++, k += 4) {
314    /* Load the first 64 bytes in 4 XMM registrers */
315    for (j = 0; j < 4; j++) {
316      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
317    }
318    /* Shuffle bytes */
319    for (j = 0; j < 2; j++) {
320      /* Compute the low 32 bytes */
321      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
322      /* Compute the hi 32 bytes */
323      xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
324    }
325    /* Shuffle 2-byte words */
326    for (j = 0; j < 2; j++) {
327      /* Compute the low 32 bytes */
328      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
329      /* Compute the hi 32 bytes */
330      xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
331    }
332    /* Store the result vectors in proper order */
333    ((__m128i *)dest)[k+0] = xmm0[0];
334    ((__m128i *)dest)[k+1] = xmm0[2];
335    ((__m128i *)dest)[k+2] = xmm0[1];
336    ((__m128i *)dest)[k+3] = xmm0[3];
337  }
338}
339
340
341/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
342static void
343unshuffle8(uint8_t* dest, uint8_t* orig, size_t size)
344{
345  size_t i, j, k;
346  size_t neblock, numof16belem;
347  __m128i xmm0[8], xmm1[8];
348
349  neblock = size / 8;
350  numof16belem = neblock / 16;
351  for (i = 0, k = 0; i < numof16belem; i++, k += 8) {
352    /* Load the first 64 bytes in 8 XMM registrers */
353    for (j = 0; j < 8; j++) {
354      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
355    }
356    /* Shuffle bytes */
357    for (j = 0; j < 4; j++) {
358      /* Compute the low 32 bytes */
359      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
360      /* Compute the hi 32 bytes */
361      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
362    }
363    /* Shuffle 2-byte words */
364    for (j = 0; j < 4; j++) {
365      /* Compute the low 32 bytes */
366      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
367      /* Compute the hi 32 bytes */
368      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
369    }
370    /* Shuffle 4-byte dwords */
371    for (j = 0; j < 4; j++) {
372      /* Compute the low 32 bytes */
373      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
374      /* Compute the hi 32 bytes */
375      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
376    }
377    /* Store the result vectors in proper order */
378    ((__m128i *)dest)[k+0] = xmm1[0];
379    ((__m128i *)dest)[k+1] = xmm1[4];
380    ((__m128i *)dest)[k+2] = xmm1[2];
381    ((__m128i *)dest)[k+3] = xmm1[6];
382    ((__m128i *)dest)[k+4] = xmm1[1];
383    ((__m128i *)dest)[k+5] = xmm1[5];
384    ((__m128i *)dest)[k+6] = xmm1[3];
385    ((__m128i *)dest)[k+7] = xmm1[7];
386  }
387}
388
389
390/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
391static void
392unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
393{
394  size_t i, j, k;
395  size_t neblock, numof16belem;
396  __m128i xmm1[16], xmm2[16];
397
398  neblock = size / 16;
399  numof16belem = neblock / 16;
400  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
401    /* Load the first 128 bytes in 16 XMM registrers */
402    for (j = 0; j < 16; j++) {
403      xmm1[j] = ((__m128i *)orig)[j*numof16belem+i];
404    }
405    /* Shuffle bytes */
406    for (j = 0; j < 8; j++) {
407      /* Compute the low 32 bytes */
408      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
409      /* Compute the hi 32 bytes */
410      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
411    }
412    /* Shuffle 2-byte words */
413    for (j = 0; j < 8; j++) {
414      /* Compute the low 32 bytes */
415      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
416      /* Compute the hi 32 bytes */
417      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
418    }
419    /* Shuffle 4-byte dwords */
420    for (j = 0; j < 8; j++) {
421      /* Compute the low 32 bytes */
422      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
423      /* Compute the hi 32 bytes */
424      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
425    }
426    /* Shuffle 8-byte qwords */
427    for (j = 0; j < 8; j++) {
428      /* Compute the low 32 bytes */
429      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
430      /* Compute the hi 32 bytes */
431      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
432    }
433    /* Store the result vectors in proper order */
434    ((__m128i *)dest)[k+0] = xmm1[0];
435    ((__m128i *)dest)[k+1] = xmm1[8];
436    ((__m128i *)dest)[k+2] = xmm1[4];
437    ((__m128i *)dest)[k+3] = xmm1[12];
438    ((__m128i *)dest)[k+4] = xmm1[2];
439    ((__m128i *)dest)[k+5] = xmm1[10];
440    ((__m128i *)dest)[k+6] = xmm1[6];
441    ((__m128i *)dest)[k+7] = xmm1[14];
442    ((__m128i *)dest)[k+8] = xmm1[1];
443    ((__m128i *)dest)[k+9] = xmm1[9];
444    ((__m128i *)dest)[k+10] = xmm1[5];
445    ((__m128i *)dest)[k+11] = xmm1[13];
446    ((__m128i *)dest)[k+12] = xmm1[3];
447    ((__m128i *)dest)[k+13] = xmm1[11];
448    ((__m128i *)dest)[k+14] = xmm1[7];
449    ((__m128i *)dest)[k+15] = xmm1[15];
450  }
451}
452
453
454/* Unshuffle a block.  This can never fail. */
/* Unshuffle a block.  This can never fail.
   The SSE2 kernels both load from `_src` and store to `_dest` with
   aligned accesses and only handle blocks whose size is a power of two
   and at least 256 bytes; anything else goes through the generic path. */
void unshuffle(size_t bytesoftype, size_t blocksize,
               uint8_t* _src, uint8_t* _dest) {
  int need_generic = ((uintptr_t)_src % 16 != 0)
      || ((uintptr_t)_dest % 16 != 0)
      || ((blocksize & (blocksize - 1)) != 0)
      || (blocksize < 256);

  if (need_generic) {
    /* _src or _dest not 16-byte aligned, blocksize not a power of two,
       or block too small: use the non-SSE2 implementation. */
    _unshuffle(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized unshuffle for the element sizes we have SSE2 kernels for. */
  switch (bytesoftype) {
  case 2:
    unshuffle2(_dest, _src, blocksize);
    break;
  case 4:
    unshuffle4(_dest, _src, blocksize);
    break;
  case 8:
    unshuffle8(_dest, _src, blocksize);
    break;
  case 16:
    unshuffle16(_dest, _src, blocksize);
    break;
  default:
    /* No specialized kernel for this element size. */
    _unshuffle(bytesoftype, blocksize, _src, _dest);
    break;
  }
}
489
490#else   /* no __SSE2__ available */
491
/* Shuffle entry point for builds without SSE2: always delegates to the
   generic byte-by-byte implementation.  This can never fail. */
void shuffle(size_t bytesoftype, size_t blocksize,
             uint8_t* _src, uint8_t* _dest) {
  _shuffle(bytesoftype, blocksize, _src, _dest);
}
496
/* Unshuffle entry point for builds without SSE2: always delegates to the
   generic byte-by-byte implementation.  This can never fail. */
void unshuffle(size_t bytesoftype, size_t blocksize,
               uint8_t* _src, uint8_t* _dest) {
  _unshuffle(bytesoftype, blocksize, _src, _dest);
}
501
502#endif  /* __SSE2__ */
Note: See TracBrowser for help on using the repository browser.