Timestamp:
08/26/16 19:35:26
Author:
Hal Finkel <hfinkel@…>
Branches:
master, pympi
Children:
8ebc79b
Parents:
cda87e9
git-author:
Hal Finkel <hfinkel@…> (08/26/16 19:35:26)
git-committer:
Hal Finkel <hfinkel@…> (08/26/16 19:35:26)
Message:

Upgrade to latest blosc library

blosc git: e394f327ccc78319d90a06af0b88bce07034b8dd
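
For context: shuffle.c implements blosc's byte-shuffle filter. For elements of N bytes, byte k of every element is gathered into its own contiguous stream, which tends to turn slowly-varying data into long runs that compress well. The sketch below mirrors the generic _shuffle loop visible in the diff; it is illustrative only (the names are not blosc's API) and is not part of this changeset.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Gather byte j of every element into its own contiguous stream. */
static void shuffle_bytes(size_t bytesoftype, size_t blocksize,
                          const uint8_t* src, uint8_t* dest)
{
  size_t i, j, neblock = blocksize / bytesoftype;
  for (j = 0; j < bytesoftype; j++)
    for (i = 0; i < neblock; i++)
      dest[j * neblock + i] = src[i * bytesoftype + j];
  /* Trailing bytes that do not form a whole element pass through unchanged. */
  memcpy(dest + neblock * bytesoftype, src + neblock * bytesoftype,
         blocksize % bytesoftype);
}

int main(void)
{
  /* On a little-endian machine the mostly-zero high bytes of these small
     32-bit values end up as one long zero run after the shuffle. */
  uint32_t values[4] = { 1, 2, 3, 4 };
  uint8_t out[sizeof values];
  size_t i;
  shuffle_bytes(4, sizeof values, (const uint8_t*)values, out);
  for (i = 0; i < sizeof values; i++)
    printf("%02x ", out[i]);
  printf("\n");  /* 01 02 03 04 followed by twelve 00 bytes */
  return 0;
}

Unshuffling is the exact inverse: each stream is scattered back into element order, as the _unshuffle routine in the old file (and shuffle-generic.h in the new one) does.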

File:
1 edited

Legend:

Unmodified — shown with both the old and the new line number
Added — shown with the new line number only
Removed — shown with the old line number only
  • thirdparty/blosc/shuffle.c

    r00587dc (old)   r981e22c (new)  
    11/********************************************************************* 
    2   Blosc - Blocked Suffling and Compression Library 
    3  
    4   Author: Francesc Alted <f[email protected]> 
     2  Blosc - Blocked Shuffling and Compression Library 
     3 
     4  Author: Francesc Alted <f[email protected]> 
    55  Creation date: 2009-05-20 
    66 
    77  See LICENSES/BLOSC.txt for details about copyright and license. 
    88**********************************************************************/ 
    99 
     10#include "shuffle.h" 
     11#include "shuffle-common.h" 
     12#include "shuffle-generic.h" 
     13#include "bitshuffle-generic.h" 
    1014#include <stdio.h> 
    1115#include <string.h> 
    12 #include "shuffle.h" 
    13  
    14 #if defined(_WIN32) && !defined(__MINGW32__) 
    15   #include <windows.h> 
    16   #include "win32/stdint-windows.h" 
    17   #define __SSE2__          /* Windows does not define this by default */ 
    18 #else 
    19   #include <stdint.h> 
    20   #include <inttypes.h> 
    21 #endif  /* _WIN32 */ 
    22  
    23  
    24 /* The non-SSE2 versions of shuffle and unshuffle */ 
    25  
    26 /* Shuffle a block.  This can never fail. */ 
    27 static void _shuffle(size_t bytesoftype, size_t blocksize, 
    28                          uint8_t* _src, uint8_t* _dest) 
    29 { 
    30   size_t i, j, neblock, leftover; 
    31  
    32   /* Non-optimized shuffle */ 
    33   neblock = blocksize / bytesoftype;  /* Number of elements in a block */ 
    34   for (j = 0; j < bytesoftype; j++) { 
    35     for (i = 0; i < neblock; i++) { 
    36       _dest[j*neblock+i] = _src[i*bytesoftype+j]; 
     16 
     17/* Visual Studio < 2013 does not have stdbool.h so here it is a replacement: */ 
     18#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L 
     19/* have a C99 compiler */ 
     20typedef _Bool bool; 
     21#else 
     22/* do not have a C99 compiler */ 
     23typedef unsigned char bool; 
     24#endif 
     25static const bool false = 0; 
     26static const bool true = 1; 
     27 
     28 
     29#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ 
     30    __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) 
     31#define HAVE_CPU_FEAT_INTRIN 
     32#endif 
     33 
     34/*  Include hardware-accelerated shuffle/unshuffle routines based on 
     35    the target architecture. Note that a target architecture may support 
     36    more than one type of acceleration!*/ 
     37#if defined(SHUFFLE_AVX2_ENABLED) 
     38  #include "shuffle-avx2.h" 
     39  #include "bitshuffle-avx2.h" 
     40#endif  /* defined(SHUFFLE_AVX2_ENABLED) */ 
     41 
     42#if defined(SHUFFLE_SSE2_ENABLED) 
     43  #include "shuffle-sse2.h" 
     44  #include "bitshuffle-sse2.h" 
     45#endif  /* defined(SHUFFLE_SSE2_ENABLED) */ 
     46 
     47 
     48/*  Define function pointer types for shuffle/unshuffle routines. */ 
     49typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*); 
     50typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*); 
     51typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*); 
     52typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*); 
     53 
     54/* An implementation of shuffle/unshuffle routines. */ 
     55typedef struct shuffle_implementation { 
     56  /* Name of this implementation. */ 
     57  const char* name; 
     58  /* Function pointer to the shuffle routine for this implementation. */ 
     59  shuffle_func shuffle; 
     60  /* Function pointer to the unshuffle routine for this implementation. */ 
     61  unshuffle_func unshuffle; 
     62  /* Function pointer to the bitshuffle routine for this implementation. */ 
     63  bitshuffle_func bitshuffle; 
     64  /* Function pointer to the bitunshuffle routine for this implementation. */ 
     65  bitunshuffle_func bitunshuffle; 
     66} shuffle_implementation_t; 
     67 
     68typedef enum { 
     69  BLOSC_HAVE_NOTHING = 0, 
     70  BLOSC_HAVE_SSE2 = 1, 
     71  BLOSC_HAVE_AVX2 = 2 
     72} blosc_cpu_features; 
     73 
     74/*  Detect hardware and set function pointers to the best shuffle/unshuffle 
     75    implementations supported by the host processor. */ 
     76#if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)    /* Intel/i686 */ 
     77 
     78/*  Disabled the __builtin_cpu_supports() call, as it has issues with 
     79    new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial: 
     80      "undefined symbol: __cpu_model" 
     81    For a similar report, see: 
     82    https://lists.fedoraproject.org/archives/list/[email protected]/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/ 
     83*/ 
     84#if defined(HAVE_CPU_FEAT_INTRIN) && 0 
     85static blosc_cpu_features blosc_get_cpu_features(void) { 
     86  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING; 
     87  if (__builtin_cpu_supports("sse2")) { 
     88    cpu_features |= BLOSC_HAVE_SSE2; 
     89  } 
     90  if (__builtin_cpu_supports("avx2")) { 
     91    cpu_features |= BLOSC_HAVE_AVX2; 
     92  } 
     93  return cpu_features; 
     94} 
     95#else 
     96 
     97#if defined(_MSC_VER) && !defined(__clang__) 
     98  #include <intrin.h>     /* Needed for __cpuid */ 
     99 
     100/*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */ 
     101#if _MSC_FULL_VER >= 160040219 
     102  #include <immintrin.h>  /* Needed for _xgetbv */ 
     103#elif defined(_M_IX86) 
     104 
     105/*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */ 
     106 
     107static uint64_t _xgetbv(uint32_t xcr) { 
     108    uint32_t xcr0, xcr1; 
     109    __asm { 
     110        mov        ecx, xcr 
     111        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 
     112        mov        xcr0, eax 
     113        mov        xcr1, edx 
    37114    } 
    38   } 
    39   leftover = blocksize % bytesoftype; 
    40   memcpy(_dest + neblock*bytesoftype, _src + neblock*bytesoftype, leftover); 
    41 } 
    42  
    43 /* Unshuffle a block.  This can never fail. */ 
    44 static void _unshuffle(size_t bytesoftype, size_t blocksize, 
    45                        uint8_t* _src, uint8_t* _dest) 
    46 { 
    47   size_t i, j, neblock, leftover; 
    48  
    49   /* Non-optimized unshuffle */ 
    50   neblock = blocksize / bytesoftype;  /* Number of elements in a block */ 
    51   for (i = 0; i < neblock; i++) { 
    52     for (j = 0; j < bytesoftype; j++) { 
    53       _dest[i*bytesoftype+j] = _src[j*neblock+i]; 
    54     } 
    55   } 
    56   leftover = blocksize % bytesoftype; 
    57   memcpy(_dest+neblock*bytesoftype, _src+neblock*bytesoftype, leftover); 
    58 } 
    59  
    60  
    61 #ifdef __SSE2__ 
    62  
    63 /* The SSE2 versions of shuffle and unshuffle */ 
    64  
    65 #include <emmintrin.h> 
    66  
    67 /* The next is useful for debugging purposes */ 
    68 #if 0 
    69 static void printxmm(__m128i xmm0) 
    70 { 
    71   uint8_t buf[16]; 
    72  
    73   ((__m128i *)buf)[0] = xmm0; 
    74   printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", 
    75           buf[0], buf[1], buf[2], buf[3], 
    76           buf[4], buf[5], buf[6], buf[7], 
    77           buf[8], buf[9], buf[10], buf[11], 
    78           buf[12], buf[13], buf[14], buf[15]); 
    79 } 
    80 #endif 
    81  
    82  
    83 /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ 
    84 static void 
    85 shuffle2(uint8_t* dest, uint8_t* src, size_t size) 
    86 { 
    87   size_t i, j, k; 
    88   size_t numof16belem; 
    89   __m128i xmm0[2], xmm1[2]; 
    90  
    91   numof16belem = size / (16*2); 
    92   for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) { 
    93     /* Fetch and transpose bytes, words and double words in groups of 
    94        32 bytes */ 
    95     for (k = 0; k < 2; k++) { 
    96       xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); 
    97       xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); 
    98       xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); 
    99       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); 
    100       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); 
    101       xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); 
    102       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); 
    103       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); 
    104       xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); 
    105       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); 
    106     } 
    107     /* Transpose quad words */ 
    108     for (k = 0; k < 1; k++) { 
    109       xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]); 
    110       xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]); 
    111     } 
    112     /* Store the result vectors */ 
    113     for (k = 0; k < 2; k++) { 
    114       ((__m128i *)dest)[k*numof16belem+i] = xmm1[k]; 
    115     } 
    116   } 
    117 } 
    118  
    119  
    120 /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ 
    121 static void 
    122 shuffle4(uint8_t* dest, uint8_t* src, size_t size) 
    123 { 
    124   size_t i, j, k; 
    125   size_t numof16belem; 
    126   __m128i xmm0[4], xmm1[4]; 
    127  
    128   numof16belem = size / (16*4); 
    129   for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) { 
    130     /* Fetch and transpose bytes and words in groups of 64 bytes */ 
    131     for (k = 0; k < 4; k++) { 
    132       xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); 
    133       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); 
    134       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d); 
    135       xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]); 
    136       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e); 
    137       xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); 
    138     } 
    139     /* Transpose double words */ 
    140     for (k = 0; k < 2; k++) { 
    141       xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]); 
    142       xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]); 
    143     } 
    144     /* Transpose quad words */ 
    145     for (k = 0; k < 2; k++) { 
    146       xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]); 
    147       xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]); 
    148     } 
    149     /* Store the result vectors */ 
    150     for (k = 0; k < 4; k++) { 
    151       ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; 
    152     } 
    153   } 
    154 } 
    155  
    156  
    157 /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ 
    158 static void 
    159 shuffle8(uint8_t* dest, uint8_t* src, size_t size) 
    160 { 
    161   size_t i, j, k, l; 
    162   size_t numof16belem; 
    163   __m128i xmm0[8], xmm1[8]; 
    164  
    165   numof16belem = size / (16*8); 
    166   for (i = 0, j = 0; i < numof16belem; i++, j += 16*8) { 
    167     /* Fetch and transpose bytes in groups of 128 bytes */ 
    168     for (k = 0; k < 8; k++) { 
    169       xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); 
    170       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); 
    171       xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); 
    172     } 
    173     /* Transpose words */ 
    174     for (k = 0, l = 0; k < 4; k++, l +=2) { 
    175       xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]); 
    176       xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]); 
    177     } 
    178     /* Transpose double words */ 
    179     for (k = 0, l = 0; k < 4; k++, l++) { 
    180       if (k == 2) l += 2; 
    181       xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]); 
    182       xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]); 
    183     } 
    184     /* Transpose quad words */ 
    185     for (k = 0; k < 4; k++) { 
    186       xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]); 
    187       xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]); 
    188     } 
    189     /* Store the result vectors */ 
    190     for (k = 0; k < 8; k++) { 
    191       ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; 
    192     } 
    193   } 
    194 } 
    195  
    196  
    197 /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ 
    198 static void 
    199 shuffle16(uint8_t* dest, uint8_t* src, size_t size) 
    200 { 
    201   size_t i, j, k, l; 
    202   size_t numof16belem; 
    203   __m128i xmm0[16], xmm1[16]; 
    204  
    205   numof16belem = size / (16*16); 
    206   for (i = 0, j = 0; i < numof16belem; i++, j += 16*16) { 
    207     /* Fetch elements in groups of 256 bytes */ 
    208     for (k = 0; k < 16; k++) { 
    209       xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); 
    210     } 
    211     /* Transpose bytes */ 
    212     for (k = 0, l = 0; k < 8; k++, l +=2) { 
    213       xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); 
    214       xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); 
    215     } 
    216     /* Transpose words */ 
    217     for (k = 0, l = -2; k < 8; k++, l++) { 
    218       if ((k%2) == 0) l += 2; 
    219       xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); 
    220       xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); 
    221     } 
    222     /* Transpose double words */ 
    223     for (k = 0, l = -4; k < 8; k++, l++) { 
    224       if ((k%4) == 0) l += 4; 
    225       xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); 
    226       xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); 
    227     } 
    228     /* Transpose quad words */ 
    229     for (k = 0; k < 8; k++) { 
    230       xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); 
    231       xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); 
    232     } 
    233     /* Store the result vectors */ 
    234     for (k = 0; k < 16; k++) { 
    235       ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; 
    236     } 
    237   } 
    238 } 
    239  
    240  
    241 /* Shuffle a block.  This can never fail. */ 
    242 void shuffle(size_t bytesoftype, size_t blocksize, 
    243              uint8_t* _src, uint8_t* _dest) { 
    244   int unaligned_dest = (int)((uintptr_t)_dest % 16); 
    245   int power_of_two = (blocksize & (blocksize - 1)) == 0; 
    246   int too_small = (blocksize < 256); 
    247  
    248   if (unaligned_dest || !power_of_two || too_small) { 
    249     /* _dest buffer is not aligned, not a power of two or is too 
    250        small.  Call the non-sse2 version. */ 
    251     _shuffle(bytesoftype, blocksize, _src, _dest); 
    252     return; 
    253   } 
    254  
    255   /* Optimized shuffle */ 
    256   /* The buffer must be aligned on a 16 bytes boundary, have a power */ 
    257   /* of 2 size and be larger or equal than 256 bytes. */ 
    258   if (bytesoftype == 4) { 
    259     shuffle4(_dest, _src, blocksize); 
    260   } 
    261   else if (bytesoftype == 8) { 
    262     shuffle8(_dest, _src, blocksize); 
    263   } 
    264   else if (bytesoftype == 16) { 
    265     shuffle16(_dest, _src, blocksize); 
    266   } 
    267   else if (bytesoftype == 2) { 
    268     shuffle2(_dest, _src, blocksize); 
    269   } 
    270   else { 
    271     /* Non-optimized shuffle */ 
    272     _shuffle(bytesoftype, blocksize, _src, _dest); 
    273   } 
    274 } 
    275  
    276  
    277 /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ 
    278 static void 
    279 unshuffle2(uint8_t* dest, uint8_t* orig, size_t size) 
    280 { 
    281   size_t i, k; 
    282   size_t neblock, numof16belem; 
    283   __m128i xmm1[2], xmm2[2]; 
    284  
    285   neblock = size / 2; 
    286   numof16belem = neblock / 16; 
    287   for (i = 0, k = 0; i < numof16belem; i++, k += 2) { 
    288     /* Load the first 32 bytes in 2 XMM registrers */ 
    289     xmm1[0] = ((__m128i *)orig)[0*numof16belem+i]; 
    290     xmm1[1] = ((__m128i *)orig)[1*numof16belem+i]; 
    291     /* Shuffle bytes */ 
    292     /* Compute the low 32 bytes */ 
    293     xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]); 
    294     /* Compute the hi 32 bytes */ 
    295     xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]); 
    296     /* Store the result vectors in proper order */ 
    297     ((__m128i *)dest)[k+0] = xmm2[0]; 
    298     ((__m128i *)dest)[k+1] = xmm2[1]; 
    299   } 
    300 } 
    301  
    302  
    303 /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ 
    304 static void 
    305 unshuffle4(uint8_t* dest, uint8_t* orig, size_t size) 
    306 { 
    307   size_t i, j, k; 
    308   size_t neblock, numof16belem; 
    309   __m128i xmm0[4], xmm1[4]; 
    310  
    311   neblock = size / 4; 
    312   numof16belem = neblock / 16; 
    313   for (i = 0, k = 0; i < numof16belem; i++, k += 4) { 
    314     /* Load the first 64 bytes in 4 XMM registrers */ 
    315     for (j = 0; j < 4; j++) { 
    316       xmm0[j] = ((__m128i *)orig)[j*numof16belem+i]; 
    317     } 
    318     /* Shuffle bytes */ 
    319     for (j = 0; j < 2; j++) { 
    320       /* Compute the low 32 bytes */ 
    321       xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); 
    322       /* Compute the hi 32 bytes */ 
    323       xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); 
    324     } 
    325     /* Shuffle 2-byte words */ 
    326     for (j = 0; j < 2; j++) { 
    327       /* Compute the low 32 bytes */ 
    328       xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); 
    329       /* Compute the hi 32 bytes */ 
    330       xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); 
    331     } 
    332     /* Store the result vectors in proper order */ 
    333     ((__m128i *)dest)[k+0] = xmm0[0]; 
    334     ((__m128i *)dest)[k+1] = xmm0[2]; 
    335     ((__m128i *)dest)[k+2] = xmm0[1]; 
    336     ((__m128i *)dest)[k+3] = xmm0[3]; 
    337   } 
    338 } 
    339  
    340  
    341 /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ 
    342 static void 
    343 unshuffle8(uint8_t* dest, uint8_t* orig, size_t size) 
    344 { 
    345   size_t i, j, k; 
    346   size_t neblock, numof16belem; 
    347   __m128i xmm0[8], xmm1[8]; 
    348  
    349   neblock = size / 8; 
    350   numof16belem = neblock / 16; 
    351   for (i = 0, k = 0; i < numof16belem; i++, k += 8) { 
    352     /* Load the first 64 bytes in 8 XMM registrers */ 
    353     for (j = 0; j < 8; j++) { 
    354       xmm0[j] = ((__m128i *)orig)[j*numof16belem+i]; 
    355     } 
    356     /* Shuffle bytes */ 
    357     for (j = 0; j < 4; j++) { 
    358       /* Compute the low 32 bytes */ 
    359       xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); 
    360       /* Compute the hi 32 bytes */ 
    361       xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); 
    362     } 
    363     /* Shuffle 2-byte words */ 
    364     for (j = 0; j < 4; j++) { 
    365       /* Compute the low 32 bytes */ 
    366       xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); 
    367       /* Compute the hi 32 bytes */ 
    368       xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); 
    369     } 
    370     /* Shuffle 4-byte dwords */ 
    371     for (j = 0; j < 4; j++) { 
    372       /* Compute the low 32 bytes */ 
    373       xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); 
    374       /* Compute the hi 32 bytes */ 
    375       xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); 
    376     } 
    377     /* Store the result vectors in proper order */ 
    378     ((__m128i *)dest)[k+0] = xmm1[0]; 
    379     ((__m128i *)dest)[k+1] = xmm1[4]; 
    380     ((__m128i *)dest)[k+2] = xmm1[2]; 
    381     ((__m128i *)dest)[k+3] = xmm1[6]; 
    382     ((__m128i *)dest)[k+4] = xmm1[1]; 
    383     ((__m128i *)dest)[k+5] = xmm1[5]; 
    384     ((__m128i *)dest)[k+6] = xmm1[3]; 
    385     ((__m128i *)dest)[k+7] = xmm1[7]; 
    386   } 
    387 } 
    388  
    389  
    390 /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ 
    391 static void 
    392 unshuffle16(uint8_t* dest, uint8_t* orig, size_t size) 
    393 { 
    394   size_t i, j, k; 
    395   size_t neblock, numof16belem; 
    396   __m128i xmm1[16], xmm2[16]; 
    397  
    398   neblock = size / 16; 
    399   numof16belem = neblock / 16; 
    400   for (i = 0, k = 0; i < numof16belem; i++, k += 16) { 
    401     /* Load the first 128 bytes in 16 XMM registrers */ 
    402     for (j = 0; j < 16; j++) { 
    403       xmm1[j] = ((__m128i *)orig)[j*numof16belem+i]; 
    404     } 
    405     /* Shuffle bytes */ 
    406     for (j = 0; j < 8; j++) { 
    407       /* Compute the low 32 bytes */ 
    408       xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); 
    409       /* Compute the hi 32 bytes */ 
    410       xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); 
    411     } 
    412     /* Shuffle 2-byte words */ 
    413     for (j = 0; j < 8; j++) { 
    414       /* Compute the low 32 bytes */ 
    415       xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); 
    416       /* Compute the hi 32 bytes */ 
    417       xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); 
    418     } 
    419     /* Shuffle 4-byte dwords */ 
    420     for (j = 0; j < 8; j++) { 
    421       /* Compute the low 32 bytes */ 
    422       xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); 
    423       /* Compute the hi 32 bytes */ 
    424       xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); 
    425     } 
    426     /* Shuffle 8-byte qwords */ 
    427     for (j = 0; j < 8; j++) { 
    428       /* Compute the low 32 bytes */ 
    429       xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); 
    430       /* Compute the hi 32 bytes */ 
    431       xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); 
    432     } 
    433     /* Store the result vectors in proper order */ 
    434     ((__m128i *)dest)[k+0] = xmm1[0]; 
    435     ((__m128i *)dest)[k+1] = xmm1[8]; 
    436     ((__m128i *)dest)[k+2] = xmm1[4]; 
    437     ((__m128i *)dest)[k+3] = xmm1[12]; 
    438     ((__m128i *)dest)[k+4] = xmm1[2]; 
    439     ((__m128i *)dest)[k+5] = xmm1[10]; 
    440     ((__m128i *)dest)[k+6] = xmm1[6]; 
    441     ((__m128i *)dest)[k+7] = xmm1[14]; 
    442     ((__m128i *)dest)[k+8] = xmm1[1]; 
    443     ((__m128i *)dest)[k+9] = xmm1[9]; 
    444     ((__m128i *)dest)[k+10] = xmm1[5]; 
    445     ((__m128i *)dest)[k+11] = xmm1[13]; 
    446     ((__m128i *)dest)[k+12] = xmm1[3]; 
    447     ((__m128i *)dest)[k+13] = xmm1[11]; 
    448     ((__m128i *)dest)[k+14] = xmm1[7]; 
    449     ((__m128i *)dest)[k+15] = xmm1[15]; 
    450   } 
    451 } 
    452  
    453  
    454 /* Unshuffle a block.  This can never fail. */ 
    455 void unshuffle(size_t bytesoftype, size_t blocksize, 
    456                uint8_t* _src, uint8_t* _dest) { 
    457   int unaligned_src = (int)((uintptr_t)_src % 16); 
    458   int unaligned_dest = (int)((uintptr_t)_dest % 16); 
    459   int power_of_two = (blocksize & (blocksize - 1)) == 0; 
    460   int too_small = (blocksize < 256); 
    461  
    462   if (unaligned_src || unaligned_dest || !power_of_two || too_small) { 
    463     /* _src or _dest buffer is not aligned, not a power of two or is 
    464        too small.  Call the non-sse2 version. */ 
    465     _unshuffle(bytesoftype, blocksize, _src, _dest); 
    466     return; 
    467   } 
    468  
    469   /* Optimized unshuffle */ 
    470   /* The buffers must be aligned on a 16 bytes boundary, have a power */ 
    471   /* of 2 size and be larger or equal than 256 bytes. */ 
    472   if (bytesoftype == 4) { 
    473     unshuffle4(_dest, _src, blocksize); 
    474   } 
    475   else if (bytesoftype == 8) { 
    476     unshuffle8(_dest, _src, blocksize); 
    477   } 
    478   else if (bytesoftype == 16) { 
    479     unshuffle16(_dest, _src, blocksize); 
    480   } 
    481   else if (bytesoftype == 2) { 
    482     unshuffle2(_dest, _src, blocksize); 
    483   } 
    484   else { 
    485     /* Non-optimized unshuffle */ 
    486     _unshuffle(bytesoftype, blocksize, _src, _dest); 
    487   } 
    488 } 
    489  
    490 #else   /* no __SSE2__ available */ 
    491  
    492 void shuffle(size_t bytesoftype, size_t blocksize, 
    493              uint8_t* _src, uint8_t* _dest) { 
    494   _shuffle(bytesoftype, blocksize, _src, _dest); 
    495 } 
    496  
    497 void unshuffle(size_t bytesoftype, size_t blocksize, 
    498                uint8_t* _src, uint8_t* _dest) { 
    499   _unshuffle(bytesoftype, blocksize, _src, _dest); 
    500 } 
    501  
    502 #endif  /* __SSE2__ */ 
     115    return ((uint64_t)xcr1 << 32) | xcr0; 
     116} 
     117 
     118#elif defined(_M_X64) 
     119 
     120/*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets. 
     121    These compilers don't support any of the newer acceleration ISAs 
     122    (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2 
     123    which means we can get away with returning a hard-coded value from 
     124    this implementation of _xgetbv. */ 
     125 
     126static inline uint64_t 
     127_xgetbv(uint32_t xcr) { 
     128    /* A 64-bit OS must have XMM save support. */ 
     129    return xcr == 0 ? (1UL << 1) : 0UL; 
     130} 
     131 
     132#else 
     133 
     134/* Hardware detection for any other MSVC targets (e.g., ARM) 
     135   isn't implemented at this time. */ 
     136#error This version of c-blosc only supports x86 and x64 targets with MSVC. 
     137 
     138#endif /* _MSC_FULL_VER >= 160040219 */ 
     139   
     140#else 
     141 
     142/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang, 
     143    and others using inline assembly. */ 
     144__attribute__((always_inline)) 
     145static inline void 
     146__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) { 
     147  __asm__ __volatile__ ( 
     148# if defined(__i386__) && defined (__PIC__) 
     149  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored. 
     150      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family 
     151  */ 
     152    "movl %%ebx, %%edi\n\t" 
     153    "cpuid\n\t" 
     154    "xchgl %%ebx, %%edi": 
     155    "=D" (cpuInfo[1]), 
     156#else 
     157    "cpuid": 
     158    "=b" (cpuInfo[1]), 
     159#endif  /* defined(__i386) && defined(__PIC__) */ 
     160    "=a" (cpuInfo[0]), 
     161    "=c" (cpuInfo[2]), 
     162    "=d" (cpuInfo[3]) : 
     163    "a" (function_id), "c" (subfunction_id) 
     164    ); 
     165} 
     166 
     167#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0) 
     168 
     169#define _XCR_XFEATURE_ENABLED_MASK 0 
     170 
     171/* Reads the content of an extended control register. 
     172   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family 
     173*/ 
     174static inline uint64_t 
     175_xgetbv(uint32_t xcr) { 
     176  uint32_t eax, edx; 
     177  __asm__ __volatile__ ( 
     178    /* "xgetbv" 
     179       This is specified as raw instruction bytes due to some older compilers 
     180       having issues with the mnemonic form. 
     181    */ 
     182    ".byte 0x0f, 0x01, 0xd0": 
     183    "=a" (eax), 
     184    "=d" (edx) : 
     185    "c" (xcr) 
     186    ); 
     187  return ((uint64_t)edx << 32) | eax; 
     188} 
     189 
     190#endif /* defined(_MSC_FULL_VER) */ 
     191 
     192#ifndef _XCR_XFEATURE_ENABLED_MASK 
     193#define _XCR_XFEATURE_ENABLED_MASK 0x0 
     194#endif 
     195 
     196static blosc_cpu_features blosc_get_cpu_features(void) { 
     197  blosc_cpu_features result = BLOSC_HAVE_NOTHING; 
     198  int32_t max_basic_function_id; 
     199  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */ 
     200  int32_t cpu_info[4]; 
     201  int sse2_available; 
     202  int sse3_available; 
     203  int ssse3_available; 
     204  int sse41_available; 
     205  int sse42_available; 
     206  int xsave_available; 
     207  int xsave_enabled_by_os; 
     208  int avx2_available = 0; 
     209  int avx512bw_available = 0; 
     210  int xmm_state_enabled = 0; 
     211  int ymm_state_enabled = 0; 
     212  int zmm_state_enabled = 0; 
     213  uint64_t xcr0_contents; 
     214 
     215  /* Get the number of basic functions available. */ 
     216  __cpuid(cpu_info, 0); 
     217  max_basic_function_id = cpu_info[0]; 
     218 
     219  /* Check for SSE-based features and required OS support */ 
     220  __cpuid(cpu_info, 1); 
     221  sse2_available = (cpu_info[3] & (1 << 26)) != 0; 
     222  sse3_available = (cpu_info[2] & (1 << 0)) != 0; 
     223  ssse3_available = (cpu_info[2] & (1 << 9)) != 0; 
     224  sse41_available = (cpu_info[2] & (1 << 19)) != 0; 
     225  sse42_available = (cpu_info[2] & (1 << 20)) != 0; 
     226 
     227  xsave_available = (cpu_info[2] & (1 << 26)) != 0; 
     228  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0; 
     229 
     230  /* Check for AVX-based features, if the processor supports extended features. */ 
     231  if (max_basic_function_id >= 7) { 
     232    __cpuid(cpu_info, 7); 
     233    avx2_available = (cpu_info[1] & (1 << 5)) != 0; 
     234    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0; 
     235  } 
     236 
     237  /*  Even if certain features are supported by the CPU, they may not be supported 
     238      by the OS (in which case using them would crash the process or system). 
     239      If xsave is available and enabled by the OS, check the contents of the 
     240      extended control register XCR0 to see if the CPU features are enabled. */ 
     241#if defined(_XCR_XFEATURE_ENABLED_MASK) 
     242  if (xsave_available && xsave_enabled_by_os && ( 
     243      sse2_available || sse3_available || ssse3_available 
     244      || sse41_available || sse42_available 
     245      || avx2_available || avx512bw_available)) { 
     246    /* Determine which register states can be restored by the OS. */ 
     247    xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); 
     248 
     249    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0; 
     250    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0; 
     251 
     252    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be 
     253        restored as well as all of zmm16-zmm31 and the opmask registers. */ 
     254    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70; 
     255  } 
     256#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */ 
     257 
     258#if defined(BLOSC_DUMP_CPU_INFO) 
     259  printf("Shuffle CPU Information:\n"); 
     260  printf("SSE2 available: %s\n", sse2_available ? "True" : "False"); 
     261  printf("SSE3 available: %s\n", sse3_available ? "True" : "False"); 
     262  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False"); 
     263  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False"); 
     264  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False"); 
     265  printf("AVX2 available: %s\n", avx2_available ? "True" : "False"); 
     266  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False"); 
     267  printf("XSAVE available: %s\n", xsave_available ? "True" : "False"); 
     268  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False"); 
     269  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False"); 
     270  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False"); 
     271  printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False"); 
     272#endif /* defined(BLOSC_DUMP_CPU_INFO) */ 
     273 
     274  /* Using the gathered CPU information, determine which implementation to use. */ 
     275  /* technically could fail on sse2 cpu on os without xmm support, but that 
     276   * shouldn't exist anymore */ 
     277  if (sse2_available) { 
     278    result |= BLOSC_HAVE_SSE2; 
     279  } 
     280  if (xmm_state_enabled && ymm_state_enabled && avx2_available) { 
     281    result |= BLOSC_HAVE_AVX2; 
     282  } 
     283  return result; 
     284} 
     285#endif 
     286 
     287#else   /* No hardware acceleration supported for the target architecture. */ 
     288  #if defined(_MSC_VER) 
     289  #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.") 
     290  #else 
     291  #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available. 
     292  #endif 
     293 
     294static blosc_cpu_features blosc_get_cpu_features(void) { 
     295  return BLOSC_HAVE_NOTHING; 
     296} 
     297 
     298#endif 
     299 
     300static shuffle_implementation_t get_shuffle_implementation() { 
     301  blosc_cpu_features cpu_features = blosc_get_cpu_features(); 
     302  shuffle_implementation_t impl_generic; 
     303 
     304#if defined(SHUFFLE_AVX2_ENABLED) 
     305  if (cpu_features & BLOSC_HAVE_AVX2) { 
     306    shuffle_implementation_t impl_avx2; 
     307    impl_avx2.name = "avx2"; 
     308    impl_avx2.shuffle = (shuffle_func)shuffle_avx2; 
     309    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2; 
     310    impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2; 
     311    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2; 
     312    return impl_avx2; 
     313  } 
     314#endif  /* defined(SHUFFLE_AVX2_ENABLED) */ 
     315 
     316#if defined(SHUFFLE_SSE2_ENABLED) 
     317  if (cpu_features & BLOSC_HAVE_SSE2) { 
     318    shuffle_implementation_t impl_sse2; 
     319    impl_sse2.name = "sse2"; 
     320    impl_sse2.shuffle = (shuffle_func)shuffle_sse2; 
     321    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2; 
     322    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2; 
     323    impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2; 
     324    return impl_sse2; 
     325  } 
     326#endif  /* defined(SHUFFLE_SSE2_ENABLED) */ 
     327 
     328  /*  Processor doesn't support any of the hardware-accelerated implementations, 
     329      so use the generic implementation. */ 
     330  impl_generic.name = "generic"; 
     331  impl_generic.shuffle = (shuffle_func)shuffle_generic; 
     332  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic; 
     333  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal; 
     334  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal; 
     335  return impl_generic; 
     336} 
     337 
     338 
     339/*  Flag indicating whether the implementation has been initialized. 
     340    Zero means it hasn't been initialized, non-zero means it has. */ 
     341static int32_t implementation_initialized; 
     342 
     343/*  The dynamically-chosen shuffle/unshuffle implementation. 
     344    This is only safe to use once `implementation_initialized` is set. */ 
     345static shuffle_implementation_t host_implementation; 
     346 
     347/*  Initialize the shuffle implementation, if necessary. */ 
     348#if defined(__GNUC__) || defined(__clang__) 
     349__attribute__((always_inline)) 
     350#endif 
     351static 
     352#if defined(_MSC_VER) 
     353__forceinline 
     354#else 
     355inline 
     356#endif 
     357void init_shuffle_implementation() { 
     358  /* Initialization could (in rare cases) take place concurrently on 
     359     multiple threads, but it shouldn't matter because the 
     360     initialization should return the same result on each thread (so 
     361     the implementation will be the same). Since that's the case we 
     362     can avoid complicated synchronization here and get a small 
     363     performance benefit because we don't need to perform a volatile 
     364     load on the initialization variable each time this function is 
     365     called. */ 
     366#if defined(__GNUC__) || defined(__clang__) 
     367  if (__builtin_expect(!implementation_initialized, 0)) { 
     368#else 
     369  if (!implementation_initialized) { 
     370#endif 
     371    /* Initialize the implementation. */ 
     372    host_implementation = get_shuffle_implementation(); 
     373 
     374    /*  Set the flag indicating the implementation has been initialized. */ 
     375    implementation_initialized = 1; 
     376  } 
     377} 
     378 
     379/*  Shuffle a block by dynamically dispatching to the appropriate 
     380    hardware-accelerated routine at run-time. */ 
     381void 
     382shuffle(const size_t bytesoftype, const size_t blocksize, 
     383        const uint8_t* _src, const uint8_t* _dest) { 
     384  /* Initialize the shuffle implementation if necessary. */ 
     385  init_shuffle_implementation(); 
     386 
     387  /*  The implementation is initialized. 
     388      Dispatch to its shuffle routine. */ 
     389  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest); 
     390} 
     391 
     392/*  Unshuffle a block by dynamically dispatching to the appropriate 
     393    hardware-accelerated routine at run-time. */ 
     394void 
     395unshuffle(const size_t bytesoftype, const size_t blocksize, 
     396          const uint8_t* _src, const uint8_t* _dest) { 
     397  /* Initialize the shuffle implementation if necessary. */ 
     398  init_shuffle_implementation(); 
     399 
     400  /*  The implementation is initialized. 
     401      Dispatch to its unshuffle routine. */ 
     402  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest); 
     403} 
     404 
     405/*  Bit-shuffle a block by dynamically dispatching to the appropriate 
     406    hardware-accelerated routine at run-time. */ 
     407int 
     408bitshuffle(const size_t bytesoftype, const size_t blocksize, 
     409           const uint8_t* const _src, const uint8_t* _dest, 
     410           const uint8_t* _tmp) { 
     411  int size = blocksize / bytesoftype; 
     412  /* Initialize the shuffle implementation if necessary. */ 
     413  init_shuffle_implementation(); 
     414 
     415  if ((size % 8) == 0) 
     416    /* The number of elems is a multiple of 8 which is supported by 
     417       bitshuffle. */ 
     418    return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest, 
     419                                                 blocksize / bytesoftype, 
     420                                                 bytesoftype, (void*)_tmp); 
     421  else 
     422    memcpy((void*)_dest, (void*)_src, blocksize); 
     423  return size; 
     424} 
     425 
     426/*  Bit-unshuffle a block by dynamically dispatching to the appropriate 
     427    hardware-accelerated routine at run-time. */ 
     428int 
     429bitunshuffle(const size_t bytesoftype, const size_t blocksize, 
     430             const uint8_t* const _src, const uint8_t* _dest, 
     431             const uint8_t* _tmp) { 
     432  int size = blocksize / bytesoftype; 
     433  /* Initialize the shuffle implementation if necessary. */ 
     434  init_shuffle_implementation(); 
     435 
     436  if ((size % 8) == 0) 
     437    /* The number of elems is a multiple of 8 which is supported by 
     438       bitshuffle. */ 
     439    return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest, 
     440                                                   blocksize / bytesoftype, 
     441                                                   bytesoftype, (void*)_tmp); 
     442  else 
     443    memcpy((void*)_dest, (void*)_src, blocksize); 
     444  return size; 
     445} 
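
The substance of the upgrade is the dispatch rewrite visible above: the old file committed to SSE2 at compile time via #ifdef __SSE2__, while the new file detects CPU features at run time (cpuid, plus _xgetbv for OS-level register-state support), fills a shuffle_implementation_t function-pointer table once, and routes every shuffle/unshuffle/bitshuffle/bitunshuffle call through it. A minimal sketch of that pattern, with illustrative names (pick_impl stands in for blosc_get_cpu_features() plus get_shuffle_implementation(); none of this is blosc's actual API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void (*shuffle_func)(size_t, size_t, const uint8_t*, uint8_t*);

/* Portable fallback, always available. */
static void shuffle_generic_impl(size_t bytesoftype, size_t blocksize,
                                 const uint8_t* src, uint8_t* dest)
{
  size_t i, j, neblock = blocksize / bytesoftype;
  for (j = 0; j < bytesoftype; j++)
    for (i = 0; i < neblock; i++)
      dest[j * neblock + i] = src[i * bytesoftype + j];
}

static shuffle_func cached_shuffle;  /* chosen once, on first use */

static shuffle_func pick_impl(void)
{
  /* A real version would consult cpuid here and return an SSE2 or AVX2
     routine when both the CPU and the OS support it, as the new
     blosc_get_cpu_features() above does. */
  return shuffle_generic_impl;
}

void my_shuffle(size_t bytesoftype, size_t blocksize,
                const uint8_t* src, uint8_t* dest)
{
  /* The unsynchronized check mirrors the benign race that the new
     init_shuffle_implementation() accepts: every thread computes the
     same answer, so the worst case is a redundant store. */
  if (!cached_shuffle)
    cached_shuffle = pick_impl();
  cached_shuffle(bytesoftype, blocksize, src, dest);
}

int main(void)
{
  uint16_t v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  uint8_t out[sizeof v];
  my_shuffle(2, sizeof v, (const uint8_t*)v, out);
  printf("%02x %02x\n", out[0], out[8]);  /* first byte of each stream */
  return 0;
}

Beyond the dispatch, the new file also introduces entry points that did not exist before (bitshuffle and bitunshuffle, backed by the bitshuffle-generic/sse2/avx2 headers it now includes), which is the other user-visible addition of this upgrade.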