[00587dc] | 1 | /********************************************************************* |
---|
[981e22c] | 2 | Blosc - Blocked Shuffling and Compression Library |
---|
[00587dc] | 3 | |
---|
[981e22c] | 4 | Author: Francesc Alted <[email protected]> |
---|
[00587dc] | 5 | Creation date: 2009-05-20 |
---|
| 6 | |
---|
| 7 | See LICENSES/BLOSC.txt for details about copyright and rights to use. |
---|
| 8 | **********************************************************************/ |
---|
| 9 | |
---|
[981e22c] | 10 | #include "shuffle.h" |
---|
| 11 | #include "shuffle-common.h" |
---|
| 12 | #include "shuffle-generic.h" |
---|
| 13 | #include "bitshuffle-generic.h" |
---|
[00587dc] | 14 | #include <stdio.h> |
---|
| 15 | #include <string.h> |
---|
| 16 | |
---|
/* Minimal `bool` replacement, needed because Visual Studio older than 2013
   does not ship <stdbool.h>. */
#if !(defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
/* Not a C99 compiler: use the smallest unsigned integer type. */
typedef unsigned char bool;
#else
/* C99 compiler: reuse the built-in boolean type. */
typedef _Bool bool;
#endif
static const bool true = 1;
static const bool false = 0;
---|
[00587dc] | 27 | |
---|
| 28 | |
---|
/* Define HAVE_CPU_FEAT_INTRIN for GCC 4.8 or newer, excluding Clang (which
   also defines __GNUC__).  The symbol gates the __builtin_cpu_supports()
   based feature detection further down in this file.

   The version test is wrapped in its own parentheses on purpose: `&&` binds
   tighter than `||`, so without them the "4.8 or later" alternative would be
   evaluated independently of the Clang and defined() checks before it. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
---|
[00587dc] | 33 | |
---|
[981e22c] | 34 | /* Include hardware-accelerated shuffle/unshuffle routines based on |
---|
| 35 | the target architecture. Note that a target architecture may support |
---|
| 36 | more than one type of acceleration!*/ |
---|
| 37 | #if defined(SHUFFLE_AVX2_ENABLED) |
---|
| 38 | #include "shuffle-avx2.h" |
---|
| 39 | #include "bitshuffle-avx2.h" |
---|
| 40 | #endif /* defined(SHUFFLE_AVX2_ENABLED) */ |
---|
| 41 | |
---|
| 42 | #if defined(SHUFFLE_SSE2_ENABLED) |
---|
| 43 | #include "shuffle-sse2.h" |
---|
| 44 | #include "bitshuffle-sse2.h" |
---|
| 45 | #endif /* defined(SHUFFLE_SSE2_ENABLED) */ |
---|
| 46 | |
---|
| 47 | |
---|
/* Function pointer types for the byte-level shuffle/unshuffle routines.
   The dispatchers in this file call them as
   (bytesoftype, blocksize, source buffer, destination buffer). */
typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                             const uint8_t* src, const uint8_t* dest);
typedef void (*unshuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t* src, const uint8_t* dest);

/* Function pointer types for the bit-level shuffle/unshuffle routines.
   The dispatchers in this file call them as
   (source, destination, number of elements, element size, scratch buffer). */
typedef int64_t (*bitshuffle_func)(void* src, void* dest, const size_t nelems,
                                   const size_t elem_size, void* tmp);
typedef int64_t (*bitunshuffle_func)(void* src, void* dest, const size_t nelems,
                                     const size_t elem_size, void* tmp);

/* One complete set of shuffle routines (e.g. generic, SSE2 or AVX2). */
typedef struct shuffle_implementation {
  const char* name;               /* Human-readable name of the implementation. */
  shuffle_func shuffle;           /* Byte shuffle routine. */
  unshuffle_func unshuffle;       /* Byte unshuffle routine. */
  bitshuffle_func bitshuffle;     /* Bit shuffle routine. */
  bitunshuffle_func bitunshuffle; /* Bit unshuffle routine. */
} shuffle_implementation_t;
---|
| 67 | |
---|
/* Bit flags naming the SIMD instruction sets detected on the host CPU.
   The detection code combines them with bitwise OR. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1 << 0,
  BLOSC_HAVE_AVX2 = 1 << 1
} blosc_cpu_features;
---|
| 73 | |
---|
| 74 | /* Detect hardware and set function pointers to the best shuffle/unshuffle |
---|
| 75 | implementations supported by the host processor. */ |
---|
| 76 | #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */ |
---|
| 77 | |
---|
| 78 | /* Disabled the __builtin_cpu_supports() call, as it has issues with |
---|
| 79 | new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial: |
---|
| 80 | "undefined symbol: __cpu_model" |
---|
| 81 | For a similar report, see: |
---|
| 82 | https://lists.fedoraproject.org/archives/list/[email protected]/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/ |
---|
| 83 | */ |
---|
| 84 | #if defined(HAVE_CPU_FEAT_INTRIN) && 0 |
---|
| 85 | static blosc_cpu_features blosc_get_cpu_features(void) { |
---|
| 86 | blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING; |
---|
| 87 | if (__builtin_cpu_supports("sse2")) { |
---|
| 88 | cpu_features |= BLOSC_HAVE_SSE2; |
---|
[00587dc] | 89 | } |
---|
[981e22c] | 90 | if (__builtin_cpu_supports("avx2")) { |
---|
| 91 | cpu_features |= BLOSC_HAVE_AVX2; |
---|
[00587dc] | 92 | } |
---|
[981e22c] | 93 | return cpu_features; |
---|
[00587dc] | 94 | } |
---|
[981e22c] | 95 | #else |
---|
[00587dc] | 96 | |
---|
[981e22c] | 97 | #if defined(_MSC_VER) && !defined(__clang__) |
---|
| 98 | #include <intrin.h> /* Needed for __cpuid */ |
---|
[00587dc] | 99 | |
---|
[981e22c] | 100 | /* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */ |
---|
| 101 | #if _MSC_FULL_VER >= 160040219 |
---|
| 102 | #include <immintrin.h> /* Needed for _xgetbv */ |
---|
| 103 | #elif defined(_M_IX86) |
---|
[00587dc] | 104 | |
---|
[981e22c] | 105 | /* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */ |
---|
[00587dc] | 106 | |
---|
[981e22c] | 107 | static uint64_t _xgetbv(uint32_t xcr) { |
---|
| 108 | uint32_t xcr0, xcr1; |
---|
| 109 | __asm { |
---|
| 110 | mov ecx, xcr |
---|
| 111 | _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 |
---|
| 112 | mov xcr0, eax |
---|
| 113 | mov xcr1, edx |
---|
[00587dc] | 114 | } |
---|
[981e22c] | 115 | return ((uint64_t)xcr1 << 32) | xcr0; |
---|
[00587dc] | 116 | } |
---|
| 117 | |
---|
[981e22c] | 118 | #elif defined(_M_X64) |
---|
[00587dc] | 119 | |
---|
/* _xgetbv replacement for 64-bit (x64) builds with VS2008 and VS2010 RTM.

   These compilers support none of the newer acceleration ISAs (e.g. AVX2)
   that blosc can use, and every x64 processor supports SSE2, so a
   hard-coded answer is sufficient: register 0 reports only bit 1 (XMM
   state) as set, and every other register reads as zero. */
static inline uint64_t
_xgetbv(uint32_t xcr) {
  if (xcr != 0) {
    return 0UL;
  }
  /* A 64-bit OS must have XMM save support. */
  return 1UL << 1;
}
---|
| 131 | |
---|
[981e22c] | 132 | #else |
---|
[00587dc] | 133 | |
---|
[981e22c] | 134 | /* Hardware detection for any other MSVC targets (e.g., ARM) |
---|
| 135 | isn't implemented at this time. */ |
---|
| 136 | #error This version of c-blosc only supports x86 and x64 targets with MSVC. |
---|
[00587dc] | 137 | |
---|
[981e22c] | 138 | #endif /* _MSC_FULL_VER >= 160040219 */ |
---|
| 139 | |
---|
| 140 | #else |
---|
[00587dc] | 141 | |
---|
/* Implementation of the __cpuid and __cpuidex intrinsics for GCC, Clang and
   other non-MSVC compilers, written with inline assembly.

   Executes `cpuid` with eax = function_id and ecx = subfunction_id, then
   stores the resulting eax, ebx, ecx and edx into cpuInfo[0..3]. */
__attribute__((always_inline))
static inline void
__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
  __asm__ __volatile__ (
#if defined(__i386__) && defined(__PIC__)
    /* ebx can't be clobbered by 32-bit PIC code, so it is saved in edi
       around the instruction and the result is read back from edi.
       https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
    */
    "movl %%ebx, %%edi\n\t"
    "cpuid\n\t"
    "xchgl %%ebx, %%edi"
    : "=D" (cpuInfo[1]),
#else
    "cpuid"
    : "=b" (cpuInfo[1]),
#endif  /* defined(__i386__) && defined(__PIC__) */
      "=a" (cpuInfo[0]),
      "=c" (cpuInfo[2]),
      "=d" (cpuInfo[3])
    : "a" (function_id), "c" (subfunction_id)
  );
}

/* Same as __cpuidex, with the sub-function fixed to 0. */
#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
---|
| 168 | |
---|
/* Register number handed to _xgetbv by the feature detection code in this
   file in order to read XCR0. */
#define _XCR_XFEATURE_ENABLED_MASK 0

/* Read the extended control register selected by xcr and return its
   64-bit contents (edx holds the upper half, eax the lower half).
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
static inline uint64_t
_xgetbv(uint32_t xcr) {
  uint32_t low_half, high_half;
  __asm__ __volatile__ (
    /* This is the "xgetbv" instruction, written as raw instruction bytes
       because some older compilers have issues with the mnemonic form. */
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_half),
      "=d" (high_half)
    : "c" (xcr)
  );
  return ((uint64_t)high_half << 32) | low_half;
}
---|
| 189 | |
---|
[981e22c] | 190 | #endif /* defined(_MSC_FULL_VER) */ |
---|
[00587dc] | 191 | |
---|
[981e22c] | 192 | #ifndef _XCR_XFEATURE_ENABLED_MASK |
---|
| 193 | #define _XCR_XFEATURE_ENABLED_MASK 0x0 |
---|
| 194 | #endif |
---|
[00587dc] | 195 | |
---|
[981e22c] | 196 | static blosc_cpu_features blosc_get_cpu_features(void) { |
---|
| 197 | blosc_cpu_features result = BLOSC_HAVE_NOTHING; |
---|
| 198 | int32_t max_basic_function_id; |
---|
| 199 | /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */ |
---|
| 200 | int32_t cpu_info[4]; |
---|
| 201 | int sse2_available; |
---|
| 202 | int sse3_available; |
---|
| 203 | int ssse3_available; |
---|
| 204 | int sse41_available; |
---|
| 205 | int sse42_available; |
---|
| 206 | int xsave_available; |
---|
| 207 | int xsave_enabled_by_os; |
---|
| 208 | int avx2_available = 0; |
---|
| 209 | int avx512bw_available = 0; |
---|
| 210 | int xmm_state_enabled = 0; |
---|
| 211 | int ymm_state_enabled = 0; |
---|
| 212 | int zmm_state_enabled = 0; |
---|
| 213 | uint64_t xcr0_contents; |
---|
| 214 | |
---|
| 215 | /* Get the number of basic functions available. */ |
---|
| 216 | __cpuid(cpu_info, 0); |
---|
| 217 | max_basic_function_id = cpu_info[0]; |
---|
| 218 | |
---|
| 219 | /* Check for SSE-based features and required OS support */ |
---|
| 220 | __cpuid(cpu_info, 1); |
---|
| 221 | sse2_available = (cpu_info[3] & (1 << 26)) != 0; |
---|
| 222 | sse3_available = (cpu_info[2] & (1 << 0)) != 0; |
---|
| 223 | ssse3_available = (cpu_info[2] & (1 << 9)) != 0; |
---|
| 224 | sse41_available = (cpu_info[2] & (1 << 19)) != 0; |
---|
| 225 | sse42_available = (cpu_info[2] & (1 << 20)) != 0; |
---|
| 226 | |
---|
| 227 | xsave_available = (cpu_info[2] & (1 << 26)) != 0; |
---|
| 228 | xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0; |
---|
| 229 | |
---|
| 230 | /* Check for AVX-based features, if the processor supports extended features. */ |
---|
| 231 | if (max_basic_function_id >= 7) { |
---|
| 232 | __cpuid(cpu_info, 7); |
---|
| 233 | avx2_available = (cpu_info[1] & (1 << 5)) != 0; |
---|
| 234 | avx512bw_available = (cpu_info[1] & (1 << 30)) != 0; |
---|
[00587dc] | 235 | } |
---|
| 236 | |
---|
[981e22c] | 237 | /* Even if certain features are supported by the CPU, they may not be supported |
---|
| 238 | by the OS (in which case using them would crash the process or system). |
---|
| 239 | If xsave is available and enabled by the OS, check the contents of the |
---|
| 240 | extended control register XCR0 to see if the CPU features are enabled. */ |
---|
| 241 | #if defined(_XCR_XFEATURE_ENABLED_MASK) |
---|
| 242 | if (xsave_available && xsave_enabled_by_os && ( |
---|
| 243 | sse2_available || sse3_available || ssse3_available |
---|
| 244 | || sse41_available || sse42_available |
---|
| 245 | || avx2_available || avx512bw_available)) { |
---|
| 246 | /* Determine which register states can be restored by the OS. */ |
---|
| 247 | xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); |
---|
| 248 | |
---|
| 249 | xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0; |
---|
| 250 | ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0; |
---|
| 251 | |
---|
| 252 | /* Require support for both the upper 256-bits of zmm0-zmm15 to be |
---|
| 253 | restored as well as all of zmm16-zmm31 and the opmask registers. */ |
---|
| 254 | zmm_state_enabled = (xcr0_contents & 0x70) == 0x70; |
---|
[00587dc] | 255 | } |
---|
[981e22c] | 256 | #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */ |
---|
| 257 | |
---|
| 258 | #if defined(BLOSC_DUMP_CPU_INFO) |
---|
| 259 | printf("Shuffle CPU Information:\n"); |
---|
| 260 | printf("SSE2 available: %s\n", sse2_available ? "True" : "False"); |
---|
| 261 | printf("SSE3 available: %s\n", sse3_available ? "True" : "False"); |
---|
| 262 | printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False"); |
---|
| 263 | printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False"); |
---|
| 264 | printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False"); |
---|
| 265 | printf("AVX2 available: %s\n", avx2_available ? "True" : "False"); |
---|
| 266 | printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False"); |
---|
| 267 | printf("XSAVE available: %s\n", xsave_available ? "True" : "False"); |
---|
| 268 | printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False"); |
---|
| 269 | printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False"); |
---|
| 270 | printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False"); |
---|
| 271 | printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False"); |
---|
| 272 | #endif /* defined(BLOSC_DUMP_CPU_INFO) */ |
---|
| 273 | |
---|
| 274 | /* Using the gathered CPU information, determine which implementation to use. */ |
---|
| 275 | /* technically could fail on sse2 cpu on os without xmm support, but that |
---|
| 276 | * shouldn't exist anymore */ |
---|
| 277 | if (sse2_available) { |
---|
| 278 | result |= BLOSC_HAVE_SSE2; |
---|
[00587dc] | 279 | } |
---|
[981e22c] | 280 | if (xmm_state_enabled && ymm_state_enabled && avx2_available) { |
---|
| 281 | result |= BLOSC_HAVE_AVX2; |
---|
[00587dc] | 282 | } |
---|
[981e22c] | 283 | return result; |
---|
[00587dc] | 284 | } |
---|
[981e22c] | 285 | #endif |
---|
[00587dc] | 286 | |
---|
[981e22c] | 287 | #else /* No hardware acceleration supported for the target architecture. */ |
---|
| 288 | #if defined(_MSC_VER) |
---|
| 289 | #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.") |
---|
| 290 | #else |
---|
| 291 | #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available. |
---|
| 292 | #endif |
---|
[00587dc] | 293 | |
---|
[981e22c] | 294 | static blosc_cpu_features blosc_get_cpu_features(void) { |
---|
| 295 | return BLOSC_HAVE_NOTHING; |
---|
[00587dc] | 296 | } |
---|
| 297 | |
---|
[981e22c] | 298 | #endif |
---|
[00587dc] | 299 | |
---|
[981e22c] | 300 | static shuffle_implementation_t get_shuffle_implementation() { |
---|
| 301 | blosc_cpu_features cpu_features = blosc_get_cpu_features(); |
---|
| 302 | shuffle_implementation_t impl_generic; |
---|
| 303 | |
---|
| 304 | #if defined(SHUFFLE_AVX2_ENABLED) |
---|
| 305 | if (cpu_features & BLOSC_HAVE_AVX2) { |
---|
| 306 | shuffle_implementation_t impl_avx2; |
---|
| 307 | impl_avx2.name = "avx2"; |
---|
| 308 | impl_avx2.shuffle = (shuffle_func)shuffle_avx2; |
---|
| 309 | impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2; |
---|
| 310 | impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2; |
---|
| 311 | impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2; |
---|
| 312 | return impl_avx2; |
---|
[00587dc] | 313 | } |
---|
[981e22c] | 314 | #endif /* defined(SHUFFLE_AVX2_ENABLED) */ |
---|
| 315 | |
---|
| 316 | #if defined(SHUFFLE_SSE2_ENABLED) |
---|
| 317 | if (cpu_features & BLOSC_HAVE_SSE2) { |
---|
| 318 | shuffle_implementation_t impl_sse2; |
---|
| 319 | impl_sse2.name = "sse2"; |
---|
| 320 | impl_sse2.shuffle = (shuffle_func)shuffle_sse2; |
---|
| 321 | impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2; |
---|
| 322 | impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2; |
---|
| 323 | impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2; |
---|
| 324 | return impl_sse2; |
---|
[00587dc] | 325 | } |
---|
[981e22c] | 326 | #endif /* defined(SHUFFLE_SSE2_ENABLED) */ |
---|
| 327 | |
---|
| 328 | /* Processor doesn't support any of the hardware-accelerated implementations, |
---|
| 329 | so use the generic implementation. */ |
---|
| 330 | impl_generic.name = "generic"; |
---|
| 331 | impl_generic.shuffle = (shuffle_func)shuffle_generic; |
---|
| 332 | impl_generic.unshuffle = (unshuffle_func)unshuffle_generic; |
---|
| 333 | impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal; |
---|
| 334 | impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal; |
---|
| 335 | return impl_generic; |
---|
[00587dc] | 336 | } |
---|
| 337 | |
---|
| 338 | |
---|
[981e22c] | 339 | /* Flag indicating whether the implementation has been initialized. |
---|
| 340 | Zero means it hasn't been initialized, non-zero means it has. */ |
---|
| 341 | static int32_t implementation_initialized; |
---|
[00587dc] | 342 | |
---|
[981e22c] | 343 | /* The dynamically-chosen shuffle/unshuffle implementation. |
---|
| 344 | This is only safe to use once `implementation_initialized` is set. */ |
---|
| 345 | static shuffle_implementation_t host_implementation; |
---|
[00587dc] | 346 | |
---|
[981e22c] | 347 | /* Initialize the shuffle implementation, if necessary. */ |
---|
| 348 | #if defined(__GNUC__) || defined(__clang__) |
---|
| 349 | __attribute__((always_inline)) |
---|
| 350 | #endif |
---|
| 351 | static |
---|
| 352 | #if defined(_MSC_VER) |
---|
| 353 | __forceinline |
---|
| 354 | #else |
---|
| 355 | inline |
---|
| 356 | #endif |
---|
| 357 | void init_shuffle_implementation() { |
---|
| 358 | /* Initialization could (in rare cases) take place concurrently on |
---|
| 359 | multiple threads, but it shouldn't matter because the |
---|
| 360 | initialization should return the same result on each thread (so |
---|
| 361 | the implementation will be the same). Since that's the case we |
---|
| 362 | can avoid complicated synchronization here and get a small |
---|
| 363 | performance benefit because we don't need to perform a volatile |
---|
| 364 | load on the initialization variable each time this function is |
---|
| 365 | called. */ |
---|
| 366 | #if defined(__GNUC__) || defined(__clang__) |
---|
| 367 | if (__builtin_expect(!implementation_initialized, 0)) { |
---|
| 368 | #else |
---|
| 369 | if (!implementation_initialized) { |
---|
| 370 | #endif |
---|
| 371 | /* Initialize the implementation. */ |
---|
| 372 | host_implementation = get_shuffle_implementation(); |
---|
[00587dc] | 373 | |
---|
[981e22c] | 374 | /* Set the flag indicating the implementation has been initialized. */ |
---|
| 375 | implementation_initialized = 1; |
---|
[00587dc] | 376 | } |
---|
| 377 | } |
---|
| 378 | |
---|
[981e22c] | 379 | /* Shuffle a block by dynamically dispatching to the appropriate |
---|
| 380 | hardware-accelerated routine at run-time. */ |
---|
| 381 | void |
---|
| 382 | shuffle(const size_t bytesoftype, const size_t blocksize, |
---|
| 383 | const uint8_t* _src, const uint8_t* _dest) { |
---|
| 384 | /* Initialize the shuffle implementation if necessary. */ |
---|
| 385 | init_shuffle_implementation(); |
---|
| 386 | |
---|
| 387 | /* The implementation is initialized. |
---|
| 388 | Dispatch to it's shuffle routine. */ |
---|
| 389 | (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest); |
---|
| 390 | } |
---|
[00587dc] | 391 | |
---|
[981e22c] | 392 | /* Unshuffle a block by dynamically dispatching to the appropriate |
---|
| 393 | hardware-accelerated routine at run-time. */ |
---|
| 394 | void |
---|
| 395 | unshuffle(const size_t bytesoftype, const size_t blocksize, |
---|
| 396 | const uint8_t* _src, const uint8_t* _dest) { |
---|
| 397 | /* Initialize the shuffle implementation if necessary. */ |
---|
| 398 | init_shuffle_implementation(); |
---|
| 399 | |
---|
| 400 | /* The implementation is initialized. |
---|
| 401 | Dispatch to it's unshuffle routine. */ |
---|
| 402 | (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest); |
---|
[00587dc] | 403 | } |
---|
| 404 | |
---|
[981e22c] | 405 | /* Bit-shuffle a block by dynamically dispatching to the appropriate |
---|
| 406 | hardware-accelerated routine at run-time. */ |
---|
| 407 | int |
---|
| 408 | bitshuffle(const size_t bytesoftype, const size_t blocksize, |
---|
| 409 | const uint8_t* const _src, const uint8_t* _dest, |
---|
| 410 | const uint8_t* _tmp) { |
---|
| 411 | int size = blocksize / bytesoftype; |
---|
| 412 | /* Initialize the shuffle implementation if necessary. */ |
---|
| 413 | init_shuffle_implementation(); |
---|
| 414 | |
---|
| 415 | if ((size % 8) == 0) |
---|
| 416 | /* The number of elems is a multiple of 8 which is supported by |
---|
| 417 | bitshuffle. */ |
---|
| 418 | return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest, |
---|
| 419 | blocksize / bytesoftype, |
---|
| 420 | bytesoftype, (void*)_tmp); |
---|
| 421 | else |
---|
| 422 | memcpy((void*)_dest, (void*)_src, blocksize); |
---|
| 423 | return size; |
---|
[00587dc] | 424 | } |
---|
| 425 | |
---|
[981e22c] | 426 | /* Bit-unshuffle a block by dynamically dispatching to the appropriate |
---|
| 427 | hardware-accelerated routine at run-time. */ |
---|
| 428 | int |
---|
| 429 | bitunshuffle(const size_t bytesoftype, const size_t blocksize, |
---|
| 430 | const uint8_t* const _src, const uint8_t* _dest, |
---|
| 431 | const uint8_t* _tmp) { |
---|
| 432 | int size = blocksize / bytesoftype; |
---|
| 433 | /* Initialize the shuffle implementation if necessary. */ |
---|
| 434 | init_shuffle_implementation(); |
---|
| 435 | |
---|
| 436 | if ((size % 8) == 0) |
---|
| 437 | /* The number of elems is a multiple of 8 which is supported by |
---|
| 438 | bitshuffle. */ |
---|
| 439 | return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest, |
---|
| 440 | blocksize / bytesoftype, |
---|
| 441 | bytesoftype, (void*)_tmp); |
---|
| 442 | else |
---|
| 443 | memcpy((void*)_dest, (void*)_src, blocksize); |
---|
| 444 | return size; |
---|
| 445 | } |
---|