source: thirdparty/blosc/shuffle.c @ 981e22c

Revision 981e22c, 16.1 KB checked in by Hal Finkel <hfinkel@…>, 8 years ago (diff)

Upgrade to latest blosc library

blosc git: e394f327ccc78319d90a06af0b88bce07034b8dd

  • Property mode set to 100644
Line 
1/*********************************************************************
2  Blosc - Blocked Shuffling and Compression Library
3
4  Author: Francesc Alted <[email protected]>
5  Creation date: 2009-05-20
6
7  See LICENSES/BLOSC.txt for details about copyright and rights to use.
8**********************************************************************/
9
10#include "shuffle.h"
11#include "shuffle-common.h"
12#include "shuffle-generic.h"
13#include "bitshuffle-generic.h"
14#include <stdio.h>
15#include <string.h>
16
/*  Minimal stand-in for <stdbool.h>, which Visual Studio releases older than
    2013 do not ship.  Nothing is defined when `bool`, `true` and `false`
    already exist: in C++, in C23 (where they are keywords), or when
    <stdbool.h> has been included before this point. */
#if !defined(__cplusplus) && !defined(__bool_true_false_are_defined) && \
    !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
/* C99 or newer compiler: use the built-in boolean type. */
typedef _Bool bool;
#else
/* Pre-C99 compiler: fall back to a one-byte unsigned integer. */
typedef unsigned char bool;
#endif
static const bool false = 0;
static const bool true = 1;
#endif
27
28
/*  Defined when the compiler is GCC 4.8 or newer (Clang also defines
    __GNUC__, so it is excluded explicitly).  The version test is
    parenthesized as a whole: `&&` binds tighter than `||`, so without the
    parentheses the "4.x with x >= 8" alternative would bypass the compiler
    checks that precede it. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
33
34/*  Include hardware-accelerated shuffle/unshuffle routines based on
35    the target architecture. Note that a target architecture may support
36    more than one type of acceleration!*/
37#if defined(SHUFFLE_AVX2_ENABLED)
38  #include "shuffle-avx2.h"
39  #include "bitshuffle-avx2.h"
40#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
41
42#if defined(SHUFFLE_SSE2_ENABLED)
43  #include "shuffle-sse2.h"
44  #include "bitshuffle-sse2.h"
45#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
46
47
/*  Function pointer types for the shuffle/unshuffle back ends.  The
    parameter names follow the way the dispatch wrappers in this file call
    the routines; only the types are significant. */
typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                             const uint8_t *src, const uint8_t *dest);
typedef void (*unshuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t *src, const uint8_t *dest);
typedef int64_t (*bitshuffle_func)(void *src, void *dest, const size_t nelems,
                                   const size_t bytesoftype, void *tmp);
typedef int64_t (*bitunshuffle_func)(void *src, void *dest, const size_t nelems,
                                     const size_t bytesoftype, void *tmp);

/*  One complete set of shuffle routines (a "back end"). */
typedef struct shuffle_implementation {
  /* Human-readable name of this implementation. */
  const char *name;
  /* Byte-shuffle routine. */
  shuffle_func shuffle;
  /* Byte-unshuffle routine. */
  unshuffle_func unshuffle;
  /* Bit-shuffle routine. */
  bitshuffle_func bitshuffle;
  /* Bit-unshuffle routine. */
  bitunshuffle_func bitunshuffle;
} shuffle_implementation_t;
67
/*  Flags describing which accelerated instruction sets may be used on the
    host.  The non-zero values are distinct bits so they can be OR-ed
    together and tested with `&`. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1,
  BLOSC_HAVE_AVX2 = 2
} blosc_cpu_features;
73
74/*  Detect hardware and set function pointers to the best shuffle/unshuffle
75    implementations supported by the host processor. */
76#if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)    /* Intel/i686 */
77
/*  Disabled the __builtin_cpu_supports() call, as it has issues with
    new versions of gcc (like 5.3.1 in the forthcoming ubuntu/xenial):
      "undefined symbol: __cpu_model"
    For a similar report, see:
    https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
*/
84#if defined(HAVE_CPU_FEAT_INTRIN) && 0
85static blosc_cpu_features blosc_get_cpu_features(void) {
86  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
87  if (__builtin_cpu_supports("sse2")) {
88    cpu_features |= BLOSC_HAVE_SSE2;
89  }
90  if (__builtin_cpu_supports("avx2")) {
91    cpu_features |= BLOSC_HAVE_AVX2;
92  }
93  return cpu_features;
94}
95#else
96
97#if defined(_MSC_VER) && !defined(__clang__)
98  #include <intrin.h>     /* Needed for __cpuid */
99
100/*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
101#if _MSC_FULL_VER >= 160040219
102  #include <immintrin.h>  /* Needed for _xgetbv */
103#elif defined(_M_IX86)
104
105/*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
106
107static uint64_t _xgetbv(uint32_t xcr) {
108    uint32_t xcr0, xcr1;
109    __asm {
110        mov        ecx, xcr
111        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
112        mov        xcr0, eax
113        mov        xcr1, edx
114    }
115    return ((uint64_t)xcr1 << 32) | xcr0;
116}
117
118#elif defined(_M_X64)
119
120/*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
121    These compilers don't support any of the newer acceleration ISAs
122    (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
123    which means we can get away with returning a hard-coded value from
124    this implementation of _xgetbv. */
125
126static inline uint64_t
127_xgetbv(uint32_t xcr) {
128    /* A 64-bit OS must have XMM save support. */
129    return xcr == 0 ? (1UL << 1) : 0UL;
130}
131
132#else
133
134/* Hardware detection for any other MSVC targets (e.g., ARM)
135   isn't implemented at this time. */
136#error This version of c-blosc only supports x86 and x64 targets with MSVC.
137
138#endif /* _MSC_FULL_VER >= 160040219 */
139 
140#else
141
142/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
143    and others using inline assembly. */
144__attribute__((always_inline))
145static inline void
146__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
147  __asm__ __volatile__ (
148# if defined(__i386__) && defined (__PIC__)
149  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
150      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
151  */
152    "movl %%ebx, %%edi\n\t"
153    "cpuid\n\t"
154    "xchgl %%ebx, %%edi":
155    "=D" (cpuInfo[1]),
156#else
157    "cpuid":
158    "=b" (cpuInfo[1]),
159#endif  /* defined(__i386) && defined(__PIC__) */
160    "=a" (cpuInfo[0]),
161    "=c" (cpuInfo[2]),
162    "=d" (cpuInfo[3]) :
163    "a" (function_id), "c" (subfunction_id)
164    );
165}
166
167#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
168
169#define _XCR_XFEATURE_ENABLED_MASK 0
170
171/* Reads the content of an extended control register.
172   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
173*/
174static inline uint64_t
175_xgetbv(uint32_t xcr) {
176  uint32_t eax, edx;
177  __asm__ __volatile__ (
178    /* "xgetbv"
179       This is specified as raw instruction bytes due to some older compilers
180       having issues with the mnemonic form.
181    */
182    ".byte 0x0f, 0x01, 0xd0":
183    "=a" (eax),
184    "=d" (edx) :
185    "c" (xcr)
186    );
187  return ((uint64_t)edx << 32) | eax;
188}
189
190#endif /* defined(_MSC_FULL_VER) */
191
192#ifndef _XCR_XFEATURE_ENABLED_MASK
193#define _XCR_XFEATURE_ENABLED_MASK 0x0
194#endif
195
196static blosc_cpu_features blosc_get_cpu_features(void) {
197  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
198  int32_t max_basic_function_id;
199  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
200  int32_t cpu_info[4];
201  int sse2_available;
202  int sse3_available;
203  int ssse3_available;
204  int sse41_available;
205  int sse42_available;
206  int xsave_available;
207  int xsave_enabled_by_os;
208  int avx2_available = 0;
209  int avx512bw_available = 0;
210  int xmm_state_enabled = 0;
211  int ymm_state_enabled = 0;
212  int zmm_state_enabled = 0;
213  uint64_t xcr0_contents;
214
215  /* Get the number of basic functions available. */
216  __cpuid(cpu_info, 0);
217  max_basic_function_id = cpu_info[0];
218
219  /* Check for SSE-based features and required OS support */
220  __cpuid(cpu_info, 1);
221  sse2_available = (cpu_info[3] & (1 << 26)) != 0;
222  sse3_available = (cpu_info[2] & (1 << 0)) != 0;
223  ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
224  sse41_available = (cpu_info[2] & (1 << 19)) != 0;
225  sse42_available = (cpu_info[2] & (1 << 20)) != 0;
226
227  xsave_available = (cpu_info[2] & (1 << 26)) != 0;
228  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
229
230  /* Check for AVX-based features, if the processor supports extended features. */
231  if (max_basic_function_id >= 7) {
232    __cpuid(cpu_info, 7);
233    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
234    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
235  }
236
237  /*  Even if certain features are supported by the CPU, they may not be supported
238      by the OS (in which case using them would crash the process or system).
239      If xsave is available and enabled by the OS, check the contents of the
240      extended control register XCR0 to see if the CPU features are enabled. */
241#if defined(_XCR_XFEATURE_ENABLED_MASK)
242  if (xsave_available && xsave_enabled_by_os && (
243      sse2_available || sse3_available || ssse3_available
244      || sse41_available || sse42_available
245      || avx2_available || avx512bw_available)) {
246    /* Determine which register states can be restored by the OS. */
247    xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
248
249    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
250    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
251
252    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
253        restored as well as all of zmm16-zmm31 and the opmask registers. */
254    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
255  }
256#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
257
258#if defined(BLOSC_DUMP_CPU_INFO)
259  printf("Shuffle CPU Information:\n");
260  printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
261  printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
262  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
263  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
264  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
265  printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
266  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
267  printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
268  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
269  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
270  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
271  printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
272#endif /* defined(BLOSC_DUMP_CPU_INFO) */
273
274  /* Using the gathered CPU information, determine which implementation to use. */
275  /* technically could fail on sse2 cpu on os without xmm support, but that
276   * shouldn't exist anymore */
277  if (sse2_available) {
278    result |= BLOSC_HAVE_SSE2;
279  }
280  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
281    result |= BLOSC_HAVE_AVX2;
282  }
283  return result;
284}
285#endif
286
287#else   /* No hardware acceleration supported for the target architecture. */
288  #if defined(_MSC_VER)
289  #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
290  #else
291  #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
292  #endif
293
294static blosc_cpu_features blosc_get_cpu_features(void) {
295  return BLOSC_HAVE_NOTHING;
296}
297
298#endif
299
300static shuffle_implementation_t get_shuffle_implementation() {
301  blosc_cpu_features cpu_features = blosc_get_cpu_features();
302  shuffle_implementation_t impl_generic;
303
304#if defined(SHUFFLE_AVX2_ENABLED)
305  if (cpu_features & BLOSC_HAVE_AVX2) {
306    shuffle_implementation_t impl_avx2;
307    impl_avx2.name = "avx2";
308    impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
309    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
310    impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
311    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
312    return impl_avx2;
313  }
314#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
315
316#if defined(SHUFFLE_SSE2_ENABLED)
317  if (cpu_features & BLOSC_HAVE_SSE2) {
318    shuffle_implementation_t impl_sse2;
319    impl_sse2.name = "sse2";
320    impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
321    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
322    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
323    impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
324    return impl_sse2;
325  }
326#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
327
328  /*  Processor doesn't support any of the hardware-accelerated implementations,
329      so use the generic implementation. */
330  impl_generic.name = "generic";
331  impl_generic.shuffle = (shuffle_func)shuffle_generic;
332  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
333  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
334  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
335  return impl_generic;
336}
337
338
339/*  Flag indicating whether the implementation has been initialized.
340    Zero means it hasn't been initialized, non-zero means it has. */
341static int32_t implementation_initialized;
342
343/*  The dynamically-chosen shuffle/unshuffle implementation.
344    This is only safe to use once `implementation_initialized` is set. */
345static shuffle_implementation_t host_implementation;
346
347/*  Initialize the shuffle implementation, if necessary. */
348#if defined(__GNUC__) || defined(__clang__)
349__attribute__((always_inline))
350#endif
351static
352#if defined(_MSC_VER)
353__forceinline
354#else
355inline
356#endif
357void init_shuffle_implementation() {
358  /* Initialization could (in rare cases) take place concurrently on
359     multiple threads, but it shouldn't matter because the
360     initialization should return the same result on each thread (so
361     the implementation will be the same). Since that's the case we
362     can avoid complicated synchronization here and get a small
363     performance benefit because we don't need to perform a volatile
364     load on the initialization variable each time this function is
365     called. */
366#if defined(__GNUC__) || defined(__clang__)
367  if (__builtin_expect(!implementation_initialized, 0)) {
368#else
369  if (!implementation_initialized) {
370#endif
371    /* Initialize the implementation. */
372    host_implementation = get_shuffle_implementation();
373
374    /*  Set the flag indicating the implementation has been initialized. */
375    implementation_initialized = 1;
376  }
377}
378
379/*  Shuffle a block by dynamically dispatching to the appropriate
380    hardware-accelerated routine at run-time. */
381void
382shuffle(const size_t bytesoftype, const size_t blocksize,
383        const uint8_t* _src, const uint8_t* _dest) {
384  /* Initialize the shuffle implementation if necessary. */
385  init_shuffle_implementation();
386
387  /*  The implementation is initialized.
388      Dispatch to it's shuffle routine. */
389  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
390}
391
392/*  Unshuffle a block by dynamically dispatching to the appropriate
393    hardware-accelerated routine at run-time. */
394void
395unshuffle(const size_t bytesoftype, const size_t blocksize,
396          const uint8_t* _src, const uint8_t* _dest) {
397  /* Initialize the shuffle implementation if necessary. */
398  init_shuffle_implementation();
399
400  /*  The implementation is initialized.
401      Dispatch to it's unshuffle routine. */
402  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
403}
404
405/*  Bit-shuffle a block by dynamically dispatching to the appropriate
406    hardware-accelerated routine at run-time. */
407int
408bitshuffle(const size_t bytesoftype, const size_t blocksize,
409           const uint8_t* const _src, const uint8_t* _dest,
410           const uint8_t* _tmp) {
411  int size = blocksize / bytesoftype;
412  /* Initialize the shuffle implementation if necessary. */
413  init_shuffle_implementation();
414
415  if ((size % 8) == 0)
416    /* The number of elems is a multiple of 8 which is supported by
417       bitshuffle. */
418    return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
419                                                 blocksize / bytesoftype,
420                                                 bytesoftype, (void*)_tmp);
421  else
422    memcpy((void*)_dest, (void*)_src, blocksize);
423  return size;
424}
425
426/*  Bit-unshuffle a block by dynamically dispatching to the appropriate
427    hardware-accelerated routine at run-time. */
428int
429bitunshuffle(const size_t bytesoftype, const size_t blocksize,
430             const uint8_t* const _src, const uint8_t* _dest,
431             const uint8_t* _tmp) {
432  int size = blocksize / bytesoftype;
433  /* Initialize the shuffle implementation if necessary. */
434  init_shuffle_implementation();
435
436  if ((size % 8) == 0)
437    /* The number of elems is a multiple of 8 which is supported by
438       bitshuffle. */
439    return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
440                                                   blocksize / bytesoftype,
441                                                   bytesoftype, (void*)_tmp);
442  else
443    memcpy((void*)_dest, (void*)_src, blocksize);
444  return size;
445}
Note: See TracBrowser for help on using the repository browser.