Changeset 981e22c for thirdparty/blosc/shuffle.c

- Timestamp: 08/26/16 19:35:26
- Branches: master, pympi
- Children: 8ebc79b
- Parents: cda87e9
- git-author: Hal Finkel <hfinkel@…> (08/26/16 19:35:26)
- git-committer: Hal Finkel <hfinkel@…> (08/26/16 19:35:26)
- Files: 1 edited

Legend: unmodified lines are shown without a prefix; added lines are prefixed with "+", removed lines with "-".

thirdparty/blosc/shuffle.c
--- thirdparty/blosc/shuffle.c  (r00587dc)
+++ thirdparty/blosc/shuffle.c  (r981e22c)

 /*********************************************************************
-  Blosc - Blocked Suffling and Compression Library
-
-  Author: Francesc Alted <f [email protected]>
+  Blosc - Blocked Shuffling and Compression Library
+
+  Author: Francesc Alted <f[email protected]>
   Creation date: 2009-05-20

 …
 **********************************************************************/

+#include "shuffle.h"
+#include "shuffle-common.h"
+#include "shuffle-generic.h"
+#include "bitshuffle-generic.h"
 #include <stdio.h>
 #include <string.h>
-#include "shuffle.h"
-
-#if defined(_WIN32) && !defined(__MINGW32__)
-  #include <windows.h>
-  #include "win32/stdint-windows.h"
-  #define __SSE2__   /* Windows does not define this by default */
-#else
-  #include <stdint.h>
-  #include <inttypes.h>
-#endif  /* _WIN32 */
-
-
-/* The non-SSE2 versions of shuffle and unshuffle */
-
-/* Shuffle a block. This can never fail. */
-static void _shuffle(size_t bytesoftype, size_t blocksize,
-                     uint8_t* _src, uint8_t* _dest)
-{
-  size_t i, j, neblock, leftover;
-
-  /* Non-optimized shuffle */
-  neblock = blocksize / bytesoftype;  /* Number of elements in a block */
-  for (j = 0; j < bytesoftype; j++) {
-    for (i = 0; i < neblock; i++) {
-      _dest[j*neblock+i] = _src[i*bytesoftype+j];
-    }
-  }
-  leftover = blocksize % bytesoftype;
-  memcpy(_dest + neblock*bytesoftype, _src + neblock*bytesoftype, leftover);
-}
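The removed `_shuffle` above is a byte-plane transpose: byte j of every element is gathered into contiguous plane j. A tiny standalone illustration of that loop on four 2-byte elements (not part of the changeset):

{{{
#include <stdio.h>
#include <stdint.h>

/* Same loop structure as the removed _shuffle(), on a miniature input:
   four 2-byte elements become "all low bytes, then all high bytes". */
int main(void) {
  uint8_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};   /* 4 elements x 2 bytes */
  uint8_t dest[8];
  size_t bytesoftype = 2, blocksize = 8;
  size_t neblock = blocksize / bytesoftype;    /* 4 elements */
  size_t i, j;
  for (j = 0; j < bytesoftype; j++)
    for (i = 0; i < neblock; i++)
      dest[j*neblock + i] = src[i*bytesoftype + j];
  for (i = 0; i < 8; i++)
    printf("%d ", dest[i]);                    /* prints: 1 3 5 7 2 4 6 8 */
  printf("\n");
  return 0;
}
}}}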
-
-/* Unshuffle a block. This can never fail. */
-static void _unshuffle(size_t bytesoftype, size_t blocksize,
-                       uint8_t* _src, uint8_t* _dest)
-{
-  size_t i, j, neblock, leftover;
-
-  /* Non-optimized unshuffle */
-  neblock = blocksize / bytesoftype;  /* Number of elements in a block */
-  for (i = 0; i < neblock; i++) {
-    for (j = 0; j < bytesoftype; j++) {
-      _dest[i*bytesoftype+j] = _src[j*neblock+i];
-    }
-  }
-  leftover = blocksize % bytesoftype;
-  memcpy(_dest+neblock*bytesoftype, _src+neblock*bytesoftype, leftover);
-}
-
-
-#ifdef __SSE2__
-
-/* The SSE2 versions of shuffle and unshuffle */
-
-#include <emmintrin.h>
-
-/* The next is useful for debugging purposes */
-#if 0
-static void printxmm(__m128i xmm0)
-{
-  uint8_t buf[16];
-
-  ((__m128i *)buf)[0] = xmm0;
-  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
-         buf[0], buf[1], buf[2], buf[3],
-         buf[4], buf[5], buf[6], buf[7],
-         buf[8], buf[9], buf[10], buf[11],
-         buf[12], buf[13], buf[14], buf[15]);
-}
-#endif
-
-
-/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
-static void
-shuffle2(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k;
-  size_t numof16belem;
-  __m128i xmm0[2], xmm1[2];
-
-  numof16belem = size / (16*2);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
-    /* Fetch and transpose bytes, words and double words in groups of
-       32 bytes */
-    for (k = 0; k < 2; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 1; k++) {
-      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 2; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
-    }
-  }
-}
-
-
-/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
-static void
-shuffle4(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k;
-  size_t numof16belem;
-  __m128i xmm0[4], xmm1[4];
-
-  numof16belem = size / (16*4);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) {
-    /* Fetch and transpose bytes and words in groups of 64 bytes */
-    for (k = 0; k < 4; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
-      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d);
-      xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]);
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e);
-      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
-    }
-    /* Transpose double words */
-    for (k = 0; k < 2; k++) {
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 2; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 4; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
-}
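These SSE2 kernels, and the larger ones that follow, are built almost entirely from unpack intrinsics. As a standalone illustration of the primitive (not changeset code): `_mm_unpacklo_epi8(a, b)` interleaves the low eight bytes of its two operands, which is why chained unpacks amount to a transpose.

{{{
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>

/* Illustration only: the interleave produced by _mm_unpacklo_epi8 is
   the building block of the byte transposes in this file. */
int main(void) {
  uint8_t a[16], b[16], out[16];
  int i;
  for (i = 0; i < 16; i++) { a[i] = (uint8_t)i; b[i] = (uint8_t)(i + 100); }
  __m128i va = _mm_loadu_si128((const __m128i*)a);
  __m128i vb = _mm_loadu_si128((const __m128i*)b);
  _mm_storeu_si128((__m128i*)out, _mm_unpacklo_epi8(va, vb));
  for (i = 0; i < 16; i++)
    printf("%d ", out[i]);   /* prints: 0 100 1 101 2 102 ... 7 107 */
  printf("\n");
  return 0;
}
}}}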
-
-
-/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
-static void
-shuffle8(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k, l;
-  size_t numof16belem;
-  __m128i xmm0[8], xmm1[8];
-
-  numof16belem = size / (16*8);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*8) {
-    /* Fetch and transpose bytes in groups of 128 bytes */
-    for (k = 0; k < 8; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
-      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
-    }
-    /* Transpose words */
-    for (k = 0, l = 0; k < 4; k++, l +=2) {
-      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]);
-      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]);
-    }
-    /* Transpose double words */
-    for (k = 0, l = 0; k < 4; k++, l++) {
-      if (k == 2) l += 2;
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 4; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 8; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
-}
-
-
-/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
-static void
-shuffle16(uint8_t* dest, uint8_t* src, size_t size)
-{
-  size_t i, j, k, l;
-  size_t numof16belem;
-  __m128i xmm0[16], xmm1[16];
-
-  numof16belem = size / (16*16);
-  for (i = 0, j = 0; i < numof16belem; i++, j += 16*16) {
-    /* Fetch elements in groups of 256 bytes */
-    for (k = 0; k < 16; k++) {
-      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
-    }
-    /* Transpose bytes */
-    for (k = 0, l = 0; k < 8; k++, l +=2) {
-      xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
-      xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
-    }
-    /* Transpose words */
-    for (k = 0, l = -2; k < 8; k++, l++) {
-      if ((k%2) == 0) l += 2;
-      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
-      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
-    }
-    /* Transpose double words */
-    for (k = 0, l = -4; k < 8; k++, l++) {
-      if ((k%4) == 0) l += 4;
-      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
-      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
-    }
-    /* Transpose quad words */
-    for (k = 0; k < 8; k++) {
-      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
-      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
-    }
-    /* Store the result vectors */
-    for (k = 0; k < 16; k++) {
-      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
-    }
-  }
-}
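The removed `shuffle()` wrapper below only takes the SSE2 path when the destination is 16-byte aligned, the block size is a power of two, and the block is at least 256 bytes. Its power-of-two test is the usual bit trick; a standalone check (not changeset code):

{{{
#include <stdio.h>

/* n & (n - 1) clears the lowest set bit, so the result is zero exactly
   when n is 0 or has a single bit set (the wrapper treats 0 as passing,
   which is harmless since a zero-size block shuffles nothing). */
static int power_of_two(unsigned long n) { return (n & (n - 1)) == 0; }

int main(void) {
  printf("%d %d %d\n",
         power_of_two(256),    /* 1: SSE2 path allowed */
         power_of_two(300),    /* 0: falls back to _shuffle */
         power_of_two(4096));  /* 1 */
  return 0;
}
}}}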
-
-
-/* Shuffle a block. This can never fail. */
-void shuffle(size_t bytesoftype, size_t blocksize,
-             uint8_t* _src, uint8_t* _dest) {
-  int unaligned_dest = (int)((uintptr_t)_dest % 16);
-  int power_of_two = (blocksize & (blocksize - 1)) == 0;
-  int too_small = (blocksize < 256);
-
-  if (unaligned_dest || !power_of_two || too_small) {
-    /* _dest buffer is not aligned, not a power of two or is too
-       small. Call the non-sse2 version. */
-    _shuffle(bytesoftype, blocksize, _src, _dest);
-    return;
-  }
-
-  /* Optimized shuffle */
-  /* The buffer must be aligned on a 16 bytes boundary, have a power */
-  /* of 2 size and be larger or equal than 256 bytes. */
-  if (bytesoftype == 4) {
-    shuffle4(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 8) {
-    shuffle8(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 16) {
-    shuffle16(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 2) {
-    shuffle2(_dest, _src, blocksize);
-  }
-  else {
-    /* Non-optimized shuffle */
-    _shuffle(bytesoftype, blocksize, _src, _dest);
-  }
-}
-
-
-/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
-static void
-unshuffle2(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, k;
-  size_t neblock, numof16belem;
-  __m128i xmm1[2], xmm2[2];
-
-  neblock = size / 2;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 2) {
-    /* Load the first 32 bytes in 2 XMM registrers */
-    xmm1[0] = ((__m128i *)orig)[0*numof16belem+i];
-    xmm1[1] = ((__m128i *)orig)[1*numof16belem+i];
-    /* Shuffle bytes */
-    /* Compute the low 32 bytes */
-    xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]);
-    /* Compute the hi 32 bytes */
-    xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]);
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm2[0];
-    ((__m128i *)dest)[k+1] = xmm2[1];
-  }
-}
-
-
-/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
-static void
-unshuffle4(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm0[4], xmm1[4];
-
-  neblock = size / 4;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 4) {
-    /* Load the first 64 bytes in 4 XMM registrers */
-    for (j = 0; j < 4; j++) {
-      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 2; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 2; j++) {
-      /* Compute the low 32 bytes */
-      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm0[0];
-    ((__m128i *)dest)[k+1] = xmm0[2];
-    ((__m128i *)dest)[k+2] = xmm0[1];
-    ((__m128i *)dest)[k+3] = xmm0[3];
-  }
-}
-
-
-/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
-static void
-unshuffle8(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm0[8], xmm1[8];
-
-  neblock = size / 8;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 8) {
-    /* Load the first 64 bytes in 8 XMM registrers */
-    for (j = 0; j < 8; j++) {
-      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 4-byte dwords */
-    for (j = 0; j < 4; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm1[0];
-    ((__m128i *)dest)[k+1] = xmm1[4];
-    ((__m128i *)dest)[k+2] = xmm1[2];
-    ((__m128i *)dest)[k+3] = xmm1[6];
-    ((__m128i *)dest)[k+4] = xmm1[1];
-    ((__m128i *)dest)[k+5] = xmm1[5];
-    ((__m128i *)dest)[k+6] = xmm1[3];
-    ((__m128i *)dest)[k+7] = xmm1[7];
-  }
-}
-
-
-/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
-static void
-unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
-{
-  size_t i, j, k;
-  size_t neblock, numof16belem;
-  __m128i xmm1[16], xmm2[16];
-
-  neblock = size / 16;
-  numof16belem = neblock / 16;
-  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
-    /* Load the first 128 bytes in 16 XMM registrers */
-    for (j = 0; j < 16; j++) {
-      xmm1[j] = ((__m128i *)orig)[j*numof16belem+i];
-    }
-    /* Shuffle bytes */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 2-byte words */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
-    }
-    /* Shuffle 4-byte dwords */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
-    }
-    /* Shuffle 8-byte qwords */
-    for (j = 0; j < 8; j++) {
-      /* Compute the low 32 bytes */
-      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
-      /* Compute the hi 32 bytes */
-      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
-    }
-    /* Store the result vectors in proper order */
-    ((__m128i *)dest)[k+0] = xmm1[0];
-    ((__m128i *)dest)[k+1] = xmm1[8];
-    ((__m128i *)dest)[k+2] = xmm1[4];
-    ((__m128i *)dest)[k+3] = xmm1[12];
-    ((__m128i *)dest)[k+4] = xmm1[2];
-    ((__m128i *)dest)[k+5] = xmm1[10];
-    ((__m128i *)dest)[k+6] = xmm1[6];
-    ((__m128i *)dest)[k+7] = xmm1[14];
-    ((__m128i *)dest)[k+8] = xmm1[1];
-    ((__m128i *)dest)[k+9] = xmm1[9];
-    ((__m128i *)dest)[k+10] = xmm1[5];
-    ((__m128i *)dest)[k+11] = xmm1[13];
-    ((__m128i *)dest)[k+12] = xmm1[3];
-    ((__m128i *)dest)[k+13] = xmm1[11];
-    ((__m128i *)dest)[k+14] = xmm1[7];
-    ((__m128i *)dest)[k+15] = xmm1[15];
-  }
-}
-
-
-/* Unshuffle a block. This can never fail. */
-void unshuffle(size_t bytesoftype, size_t blocksize,
-               uint8_t* _src, uint8_t* _dest) {
-  int unaligned_src = (int)((uintptr_t)_src % 16);
-  int unaligned_dest = (int)((uintptr_t)_dest % 16);
-  int power_of_two = (blocksize & (blocksize - 1)) == 0;
-  int too_small = (blocksize < 256);
-
-  if (unaligned_src || unaligned_dest || !power_of_two || too_small) {
-    /* _src or _dest buffer is not aligned, not a power of two or is
-       too small. Call the non-sse2 version. */
-    _unshuffle(bytesoftype, blocksize, _src, _dest);
-    return;
-  }
-
-  /* Optimized unshuffle */
-  /* The buffers must be aligned on a 16 bytes boundary, have a power */
-  /* of 2 size and be larger or equal than 256 bytes. */
-  if (bytesoftype == 4) {
-    unshuffle4(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 8) {
-    unshuffle8(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 16) {
-    unshuffle16(_dest, _src, blocksize);
-  }
-  else if (bytesoftype == 2) {
-    unshuffle2(_dest, _src, blocksize);
-  }
-  else {
-    /* Non-optimized unshuffle */
-    _unshuffle(bytesoftype, blocksize, _src, _dest);
-  }
-}
-
-#else  /* no __SSE2__ available */
-
-void shuffle(size_t bytesoftype, size_t blocksize,
-             uint8_t* _src, uint8_t* _dest) {
-  _shuffle(bytesoftype, blocksize, _src, _dest);
-}
-
-void unshuffle(size_t bytesoftype, size_t blocksize,
-               uint8_t* _src, uint8_t* _dest) {
-  _unshuffle(bytesoftype, blocksize, _src, _dest);
-}
-
-#endif  /* __SSE2__ */
+
+/* Visual Studio < 2013 does not have stdbool.h so here it is a replacement: */
+#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+/* have a C99 compiler */
+typedef _Bool bool;
+#else
+/* do not have a C99 compiler */
+typedef unsigned char bool;
+#endif
+static const bool false = 0;
+static const bool true = 1;
+
+
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+    __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+#define HAVE_CPU_FEAT_INTRIN
+#endif
+
+/* Include hardware-accelerated shuffle/unshuffle routines based on
+   the target architecture. Note that a target architecture may support
+   more than one type of acceleration!*/
+#if defined(SHUFFLE_AVX2_ENABLED)
+#include "shuffle-avx2.h"
+#include "bitshuffle-avx2.h"
+#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
+
+#if defined(SHUFFLE_SSE2_ENABLED)
+#include "shuffle-sse2.h"
+#include "bitshuffle-sse2.h"
+#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
+
+
+/* Define function pointer types for shuffle/unshuffle routines. */
+typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
+typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
+typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*);
+typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*);
+
+/* An implementation of shuffle/unshuffle routines. */
+typedef struct shuffle_implementation {
+  /* Name of this implementation. */
+  const char* name;
+  /* Function pointer to the shuffle routine for this implementation. */
+  shuffle_func shuffle;
+  /* Function pointer to the unshuffle routine for this implementation. */
+  unshuffle_func unshuffle;
+  /* Function pointer to the bitshuffle routine for this implementation. */
+  bitshuffle_func bitshuffle;
+  /* Function pointer to the bitunshuffle routine for this implementation. */
+  bitunshuffle_func bitunshuffle;
+} shuffle_implementation_t;
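Here the removed file ends and the added implementation's core begins. `shuffle_implementation_t` is a per-implementation dispatch table; a reduced standalone sketch of the pattern, with hypothetical names (not the changeset's code):

{{{
#include <stdio.h>

/* Illustration only: a two-entry dispatch table in the style of
   shuffle_implementation_t, selected by a pretend feature flag. */
typedef void (*kernel_func)(const char*);

typedef struct {
  const char* name;
  kernel_func run;
} impl_t;

static void run_generic(const char* msg) { printf("generic: %s\n", msg); }
static void run_accel(const char* msg)   { printf("accel: %s\n", msg); }

static impl_t pick_impl(int cpu_has_accel) {
  impl_t impl;
  impl.name = cpu_has_accel ? "accel" : "generic";
  impl.run  = cpu_has_accel ? run_accel : run_generic;
  return impl;
}

int main(void) {
  impl_t impl = pick_impl(0);   /* pretend detection found no acceleration */
  printf("using %s\n", impl.name);
  impl.run("hello");            /* dispatches through the table */
  return 0;
}
}}}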
+
+typedef enum {
+  BLOSC_HAVE_NOTHING = 0,
+  BLOSC_HAVE_SSE2 = 1,
+  BLOSC_HAVE_AVX2 = 2
+} blosc_cpu_features;
+
+/* Detect hardware and set function pointers to the best shuffle/unshuffle
+   implementations supported by the host processor. */
+#if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)  /* Intel/i686 */
+
+/* Disabled the __builtin_cpu_supports() call, as it has issues with
+   new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial:
+   "undefined symbol: __cpu_model"
+   For a similar report, see:
+   https://lists.fedoraproject.org/archives/list/[email protected]/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
+*/
+#if defined(HAVE_CPU_FEAT_INTRIN) && 0
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
+  if (__builtin_cpu_supports("sse2")) {
+    cpu_features |= BLOSC_HAVE_SSE2;
+  }
+  if (__builtin_cpu_supports("avx2")) {
+    cpu_features |= BLOSC_HAVE_AVX2;
+  }
+  return cpu_features;
+}
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>  /* Needed for __cpuid */
+
+/* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
+#if _MSC_FULL_VER >= 160040219
+#include <immintrin.h>  /* Needed for _xgetbv */
+#elif defined(_M_IX86)
+
+/* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
+
+static uint64_t _xgetbv(uint32_t xcr) {
+  uint32_t xcr0, xcr1;
+  __asm {
+    mov ecx, xcr
+    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
+    mov xcr0, eax
+    mov xcr1, edx
+  }
+  return ((uint64_t)xcr1 << 32) | xcr0;
+}
+
+#elif defined(_M_X64)
+
+/* Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
+   These compilers don't support any of the newer acceleration ISAs
+   (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
+   which means we can get away with returning a hard-coded value from
+   this implementation of _xgetbv. */
+
+static inline uint64_t
+_xgetbv(uint32_t xcr) {
+  /* A 64-bit OS must have XMM save support. */
+  return xcr == 0 ? (1UL << 1) : 0UL;
+}
+
+#else
+
+/* Hardware detection for any other MSVC targets (e.g., ARM)
+   isn't implemented at this time. */
+#error This version of c-blosc only supports x86 and x64 targets with MSVC.
+
+#endif  /* _MSC_FULL_VER >= 160040219 */
+
+#else
+
+/* Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
+   and others using inline assembly. */
+__attribute__((always_inline))
+static inline void
+__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
+  __asm__ __volatile__ (
+# if defined(__i386__) && defined (__PIC__)
+  /* Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
+     https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+  */
+    "movl %%ebx, %%edi\n\t"
+    "cpuid\n\t"
+    "xchgl %%ebx, %%edi":
+    "=D" (cpuInfo[1]),
+#else
+    "cpuid":
+    "=b" (cpuInfo[1]),
+#endif  /* defined(__i386) && defined(__PIC__) */
+    "=a" (cpuInfo[0]),
+    "=c" (cpuInfo[2]),
+    "=d" (cpuInfo[3]) :
+    "a" (function_id), "c" (subfunction_id)
+    );
+}
+
+#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
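For context, the leaf-0 query that `__cpuidex` performs can also be made with the `<cpuid.h>` helper shipped by GCC and Clang; a standalone sketch (x86/x64 targets only, not changeset code):

{{{
#include <cpuid.h>   /* GCC/Clang helper header, x86/x64 only */
#include <stdio.h>
#include <string.h>

/* Leaf 0 returns the max basic leaf in eax and the vendor string
   split across ebx, edx, ecx (in that order). */
int main(void) {
  unsigned int eax, ebx, ecx, edx;
  char vendor[13] = {0};
  if (__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
    memcpy(vendor + 0, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    printf("vendor: %s, max basic leaf: %u\n", vendor, eax);
  }
  return 0;
}
}}}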
+
+#define _XCR_XFEATURE_ENABLED_MASK 0
+
+/* Reads the content of an extended control register.
+   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+*/
+static inline uint64_t
+_xgetbv(uint32_t xcr) {
+  uint32_t eax, edx;
+  __asm__ __volatile__ (
+    /* "xgetbv"
+       This is specified as raw instruction bytes due to some older compilers
+       having issues with the mnemonic form.
+    */
+    ".byte 0x0f, 0x01, 0xd0":
+    "=a" (eax),
+    "=d" (edx) :
+    "c" (xcr)
+    );
+  return ((uint64_t)edx << 32) | eax;
+}
+
+#endif  /* defined(_MSC_FULL_VER) */
+
+#ifndef _XCR_XFEATURE_ENABLED_MASK
+#define _XCR_XFEATURE_ENABLED_MASK 0x0
+#endif
+
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
+  int32_t max_basic_function_id;
+  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
+  int32_t cpu_info[4];
+  int sse2_available;
+  int sse3_available;
+  int ssse3_available;
+  int sse41_available;
+  int sse42_available;
+  int xsave_available;
+  int xsave_enabled_by_os;
+  int avx2_available = 0;
+  int avx512bw_available = 0;
+  int xmm_state_enabled = 0;
+  int ymm_state_enabled = 0;
+  int zmm_state_enabled = 0;
+  uint64_t xcr0_contents;
+
+  /* Get the number of basic functions available. */
+  __cpuid(cpu_info, 0);
+  max_basic_function_id = cpu_info[0];
+
+  /* Check for SSE-based features and required OS support */
+  __cpuid(cpu_info, 1);
+  sse2_available = (cpu_info[3] & (1 << 26)) != 0;
+  sse3_available = (cpu_info[2] & (1 << 0)) != 0;
+  ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
+  sse41_available = (cpu_info[2] & (1 << 19)) != 0;
+  sse42_available = (cpu_info[2] & (1 << 20)) != 0;
+
+  xsave_available = (cpu_info[2] & (1 << 26)) != 0;
+  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
+
+  /* Check for AVX-based features, if the processor supports extended features. */
+  if (max_basic_function_id >= 7) {
+    __cpuid(cpu_info, 7);
+    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
+    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
+  }
+
+  /* Even if certain features are supported by the CPU, they may not be supported
+     by the OS (in which case using them would crash the process or system).
+     If xsave is available and enabled by the OS, check the contents of the
+     extended control register XCR0 to see if the CPU features are enabled. */
+#if defined(_XCR_XFEATURE_ENABLED_MASK)
+  if (xsave_available && xsave_enabled_by_os && (
+      sse2_available || sse3_available || ssse3_available
+      || sse41_available || sse42_available
+      || avx2_available || avx512bw_available)) {
+    /* Determine which register states can be restored by the OS. */
+    xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+
+    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
+    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
+
+    /* Require support for both the upper 256-bits of zmm0-zmm15 to be
+       restored as well as all of zmm16-zmm31 and the opmask registers. */
+    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
+  }
+#endif  /* defined(_XCR_XFEATURE_ENABLED_MASK) */
+
+#if defined(BLOSC_DUMP_CPU_INFO)
+  printf("Shuffle CPU Information:\n");
+  printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
+  printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
+  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
+  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
+  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
+  printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
+  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
+  printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
+  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
+  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
+  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
+  printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
+#endif  /* defined(BLOSC_DUMP_CPU_INFO) */
+
+  /* Using the gathered CPU information, determine which implementation to use. */
+  /* technically could fail on sse2 cpu on os without xmm support, but that
+   * shouldn't exist anymore */
+  if (sse2_available) {
+    result |= BLOSC_HAVE_SSE2;
+  }
+  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
+    result |= BLOSC_HAVE_AVX2;
+  }
+  return result;
+}
+#endif
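The XCR0 tests above decide whether the OS actually saves and restores the SIMD state the CPU advertises. A standalone sketch applying the same bit tests to an assumed sample value (not read from hardware):

{{{
#include <stdint.h>
#include <stdio.h>

/* Illustration only: 0x7 = x87 | XMM | YMM state enabled, which is what
   a typical AVX-era OS reports in XCR0. */
int main(void) {
  uint64_t xcr0_contents = 0x7;  /* assumed sample value, not from xgetbv */
  int xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
  int ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
  int zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
  printf("XMM: %d, YMM: %d, ZMM: %d\n",
         xmm_state_enabled, ymm_state_enabled, zmm_state_enabled);
  /* prints: XMM: 1, YMM: 1, ZMM: 0 */
  return 0;
}
}}}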
+
+#else  /* No hardware acceleration supported for the target architecture. */
+#if defined(_MSC_VER)
+#pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
+#else
+#warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
+#endif
+
+static blosc_cpu_features blosc_get_cpu_features(void) {
+  return BLOSC_HAVE_NOTHING;
+}
+
+#endif
+
+static shuffle_implementation_t get_shuffle_implementation() {
+  blosc_cpu_features cpu_features = blosc_get_cpu_features();
+  shuffle_implementation_t impl_generic;
+
+#if defined(SHUFFLE_AVX2_ENABLED)
+  if (cpu_features & BLOSC_HAVE_AVX2) {
+    shuffle_implementation_t impl_avx2;
+    impl_avx2.name = "avx2";
+    impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
+    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
+    impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
+    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
+    return impl_avx2;
+  }
+#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
+
+#if defined(SHUFFLE_SSE2_ENABLED)
+  if (cpu_features & BLOSC_HAVE_SSE2) {
+    shuffle_implementation_t impl_sse2;
+    impl_sse2.name = "sse2";
+    impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
+    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
+    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
+    impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
+    return impl_sse2;
+  }
+#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
+
+  /* Processor doesn't support any of the hardware-accelerated implementations,
+     so use the generic implementation. */
+  impl_generic.name = "generic";
+  impl_generic.shuffle = (shuffle_func)shuffle_generic;
+  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
+  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
+  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
+  return impl_generic;
+}
+
+
+/* Flag indicating whether the implementation has been initialized.
+   Zero means it hasn't been initialized, non-zero means it has. */
+static int32_t implementation_initialized;
+
+/* The dynamically-chosen shuffle/unshuffle implementation.
+   This is only safe to use once `implementation_initialized` is set. */
+static shuffle_implementation_t host_implementation;
+
+/* Initialize the shuffle implementation, if necessary. */
+#if defined(__GNUC__) || defined(__clang__)
+__attribute__((always_inline))
+#endif
+static
+#if defined(_MSC_VER)
+__forceinline
+#else
+inline
+#endif
+void init_shuffle_implementation() {
+  /* Initialization could (in rare cases) take place concurrently on
+     multiple threads, but it shouldn't matter because the
+     initialization should return the same result on each thread (so
+     the implementation will be the same). Since that's the case we
+     can avoid complicated synchronization here and get a small
+     performance benefit because we don't need to perform a volatile
+     load on the initialization variable each time this function is
+     called. */
+#if defined(__GNUC__) || defined(__clang__)
+  if (__builtin_expect(!implementation_initialized, 0)) {
+#else
+  if (!implementation_initialized) {
+#endif
+    /* Initialize the implementation. */
+    host_implementation = get_shuffle_implementation();
+
+    /* Set the flag indicating the implementation has been initialized. */
+    implementation_initialized = 1;
+  }
+}
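The initialization above deliberately tolerates a benign race: concurrent threads may each compute and store the same table. A stricter variant of the same lazy-init pattern could use C11 atomics; a sketch under that assumption (not what the changeset does):

{{{
#include <stdatomic.h>
#include <stdio.h>

/* Illustration only: publish a lazily-computed value with
   release/acquire ordering. Safe as long as detection is idempotent. */
static _Atomic int initialized;
static int chosen;   /* stands in for host_implementation */

static int detect(void) { return 42; }  /* pretend CPU detection */

static int get_chosen(void) {
  if (!atomic_load_explicit(&initialized, memory_order_acquire)) {
    chosen = detect();  /* racing threads write the same value */
    atomic_store_explicit(&initialized, 1, memory_order_release);
  }
  return chosen;
}

int main(void) {
  printf("%d\n", get_chosen());
  return 0;
}
}}}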
+
+/* Shuffle a block by dynamically dispatching to the appropriate
+   hardware-accelerated routine at run-time. */
+void
+shuffle(const size_t bytesoftype, const size_t blocksize,
+        const uint8_t* _src, const uint8_t* _dest) {
+  /* Initialize the shuffle implementation if necessary. */
+  init_shuffle_implementation();
+
+  /* The implementation is initialized.
+     Dispatch to it's shuffle routine. */
+  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
+}
+
+/* Unshuffle a block by dynamically dispatching to the appropriate
+   hardware-accelerated routine at run-time. */
+void
+unshuffle(const size_t bytesoftype, const size_t blocksize,
+          const uint8_t* _src, const uint8_t* _dest) {
+  /* Initialize the shuffle implementation if necessary. */
+  init_shuffle_implementation();
+
+  /* The implementation is initialized.
+     Dispatch to it's unshuffle routine. */
+  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
+}
+
+/* Bit-shuffle a block by dynamically dispatching to the appropriate
+   hardware-accelerated routine at run-time. */
+int
+bitshuffle(const size_t bytesoftype, const size_t blocksize,
+           const uint8_t* const _src, const uint8_t* _dest,
+           const uint8_t* _tmp) {
+  int size = blocksize / bytesoftype;
+  /* Initialize the shuffle implementation if necessary. */
+  init_shuffle_implementation();
+
+  if ((size % 8) == 0)
+    /* The number of elems is a multiple of 8 which is supported by
+       bitshuffle. */
+    return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
+                                                 blocksize / bytesoftype,
+                                                 bytesoftype, (void*)_tmp);
+  else
+    memcpy((void*)_dest, (void*)_src, blocksize);
+  return size;
+}
+
+/* Bit-unshuffle a block by dynamically dispatching to the appropriate
+   hardware-accelerated routine at run-time. */
+int
+bitunshuffle(const size_t bytesoftype, const size_t blocksize,
+             const uint8_t* const _src, const uint8_t* _dest,
+             const uint8_t* _tmp) {
+  int size = blocksize / bytesoftype;
+  /* Initialize the shuffle implementation if necessary. */
+  init_shuffle_implementation();
+
+  if ((size % 8) == 0)
+    /* The number of elems is a multiple of 8 which is supported by
+       bitshuffle. */
+    return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
+                                                   blocksize / bytesoftype,
+                                                   bytesoftype, (void*)_tmp);
+  else
+    memcpy((void*)_dest, (void*)_src, blocksize);
+  return size;
+}
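Assuming the declarations in shuffle.h match the new definitions above, a round trip through the new entry points would look roughly like this (illustrative only; any power-of-two block that is a multiple of the type size works):

{{{
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "shuffle.h"   /* shuffle()/unshuffle() as defined in this changeset */

int main(void) {
  enum { TYPESIZE = 4, BLOCKSIZE = 256 };
  uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];
  int i;
  for (i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)i;

  shuffle(TYPESIZE, BLOCKSIZE, src, shuffled);       /* dispatches to avx2/sse2/generic */
  unshuffle(TYPESIZE, BLOCKSIZE, shuffled, restored);

  printf("round trip %s\n",
         memcmp(src, restored, BLOCKSIZE) == 0 ? "ok" : "FAILED");
  return 0;
}
}}}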