[2c47b73] | 1 | /** |
---|
| 2 | * @file sz_omp.c |
---|
| 3 | * @author Xin Liang |
---|
| 4 | * @date July, 2017 |
---|
| 5 | * @brief OpenMP implementation of the SZ float compression and decompression routines |
---|
| 6 | * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. |
---|
| 7 | * See COPYRIGHT in top-level directory. |
---|
| 8 | */ |
---|
| 9 | |
---|
| 10 | #include "sz_omp.h" |
---|
| 11 | #include <math.h> |
---|
| 12 | #include <time.h> |
---|
| 13 | |
---|
/**
 * OpenMP compression of a 1D float array -- not implemented.
 *
 * @param oriData        input values (unused)
 * @param r1             number of elements (unused)
 * @param realPrecision  error bound (unused)
 * @param comp_size      receives 0 when non-NULL, so that the caller never
 *                       reads an uninitialized size next to the NULL result
 * @return always NULL
 */
unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
    (void) oriData;
    (void) r1;
    (void) realPrecision;
    if(comp_size != NULL)
        *comp_size = 0;
    return NULL;
}
---|
/**
 * OpenMP compression of a 2D float array -- not implemented.
 *
 * @param oriData        input values (unused)
 * @param r1             size of the first dimension (unused)
 * @param r2             size of the second dimension (unused)
 * @param realPrecision  error bound (unused)
 * @param comp_size      receives 0 when non-NULL, so that the caller never
 *                       reads an uninitialized size next to the NULL result
 * @return always NULL
 */
unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
    (void) oriData;
    (void) r1;
    (void) r2;
    (void) realPrecision;
    if(comp_size != NULL)
        *comp_size = 0;
    return NULL;
}
---|
| 20 | |
---|
| 21 | unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){ |
---|
| 22 | |
---|
| 23 | double elapsed_time = 0.0; |
---|
| 24 | |
---|
| 25 | elapsed_time = -omp_get_wtime(); |
---|
| 26 | unsigned int quantization_intervals; |
---|
| 27 | if(exe_params->optQuantMode==1) |
---|
| 28 | { |
---|
| 29 | // quantization_intervals = optimize_intervals_float_3D(oriData, r1, realPrecision); |
---|
| 30 | quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision); |
---|
| 31 | //quantization_intervals = 32768; |
---|
| 32 | printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision); |
---|
| 33 | // exit(0); |
---|
| 34 | updateQuantizationInfo(quantization_intervals); |
---|
| 35 | } |
---|
| 36 | else{ |
---|
| 37 | quantization_intervals = exe_params->intvCapacity; |
---|
| 38 | } |
---|
| 39 | elapsed_time += omp_get_wtime(); |
---|
| 40 | printf("opt interval time: %.4f\n", elapsed_time); |
---|
| 41 | |
---|
| 42 | elapsed_time = -omp_get_wtime(); |
---|
| 43 | int thread_num = omp_get_max_threads(); |
---|
| 44 | int thread_order = (int)log2(thread_num); |
---|
| 45 | size_t num_x = 0, num_y = 0, num_z = 0; |
---|
| 46 | { |
---|
| 47 | int block_thread_order = thread_order / 3; |
---|
| 48 | switch(thread_order % 3){ |
---|
| 49 | case 0:{ |
---|
| 50 | num_x = 1 << block_thread_order; |
---|
| 51 | num_y = 1 << block_thread_order; |
---|
| 52 | num_z = 1 << block_thread_order; |
---|
| 53 | break; |
---|
| 54 | } |
---|
| 55 | case 1:{ |
---|
| 56 | num_x = 1 << (block_thread_order + 1); |
---|
| 57 | num_y = 1 << block_thread_order; |
---|
| 58 | num_z = 1 << block_thread_order; |
---|
| 59 | break; |
---|
| 60 | } |
---|
| 61 | case 2:{ |
---|
| 62 | num_x = 1 << (block_thread_order + 1); |
---|
| 63 | num_y = 1 << (block_thread_order + 1); |
---|
| 64 | num_z = 1 << block_thread_order; |
---|
| 65 | break; |
---|
| 66 | } |
---|
| 67 | } |
---|
| 68 | thread_num = num_x * num_y * num_z; |
---|
| 69 | } |
---|
| 70 | omp_set_num_threads(thread_num); |
---|
| 71 | // calculate block dims |
---|
| 72 | printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z); |
---|
| 73 | |
---|
| 74 | size_t split_index_x, split_index_y, split_index_z; |
---|
| 75 | size_t early_blockcount_x, early_blockcount_y, early_blockcount_z; |
---|
| 76 | size_t late_blockcount_x, late_blockcount_y, late_blockcount_z; |
---|
| 77 | SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x); |
---|
| 78 | SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y); |
---|
| 79 | SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z); |
---|
| 80 | |
---|
| 81 | size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z; |
---|
| 82 | size_t num_blocks = num_x * num_y * num_z; |
---|
| 83 | size_t num_elements = r1 * r2 * r3; |
---|
| 84 | // printf("max_num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks); |
---|
| 85 | |
---|
| 86 | size_t dim0_offset = r2 * r3; |
---|
| 87 | size_t dim1_offset = r3; |
---|
| 88 | |
---|
| 89 | // printf("malloc blockinfo array start\n"); |
---|
| 90 | // fflush(stdout); |
---|
| 91 | |
---|
| 92 | size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(float); |
---|
| 93 | int * result_type = (int *) malloc(num_elements * sizeof(int)); |
---|
| 94 | size_t unpred_data_max_size = max_num_block_elements; |
---|
| 95 | float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks); |
---|
| 96 | unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int)); |
---|
| 97 | float * mean = malloc(num_blocks * sizeof(float)); |
---|
| 98 | float * buffer0, * buffer1; |
---|
| 99 | buffer0 = (float *) malloc(buffer_size * thread_num); |
---|
| 100 | buffer1 = (float *) malloc(buffer_size * thread_num); |
---|
| 101 | unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(float))); |
---|
| 102 | size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); |
---|
| 103 | unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks); |
---|
| 104 | size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); |
---|
| 105 | size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t)); |
---|
| 106 | memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t)); |
---|
| 107 | |
---|
| 108 | size_t stateNum = quantization_intervals*2; |
---|
| 109 | HuffmanTree* huffmanTree = createHuffmanTree(stateNum); |
---|
| 110 | |
---|
| 111 | int num_yz = num_y * num_z; |
---|
| 112 | #pragma omp parallel for |
---|
| 113 | for(int t=0; t<thread_num; t++){ |
---|
| 114 | int id = omp_get_thread_num(); |
---|
| 115 | int i = id/(num_yz); |
---|
| 116 | int j = (id % num_yz) / num_z; |
---|
| 117 | int k = id % num_z; |
---|
| 118 | // printf("%d: %d %d %d\n", omp_get_thread_num(), i, j, k); |
---|
| 119 | size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; |
---|
| 120 | size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; |
---|
| 121 | size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; |
---|
| 122 | float * data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z; |
---|
| 123 | |
---|
| 124 | size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x; |
---|
| 125 | size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y; |
---|
| 126 | size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z; |
---|
| 127 | size_t type_offset = offset_x * dim0_offset + offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y; |
---|
| 128 | int * type = result_type + type_offset; |
---|
| 129 | |
---|
| 130 | float * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size; |
---|
| 131 | float *P0, *P1; // buffer |
---|
| 132 | // P0 = (float *) malloc(buffer_size); |
---|
| 133 | // P1 = (float *) malloc(buffer_size); |
---|
| 134 | P0 = buffer0 + id * early_blockcount_y * early_blockcount_z; |
---|
| 135 | P1 = buffer1 + id * early_blockcount_y * early_blockcount_z; |
---|
| 136 | unpredictable_count[id] = SZ_compress_float_3D_MDQ_RA_block(data_pos, mean + id, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, P0, P1, type, unpredictable_data); |
---|
| 137 | // free(P0); |
---|
| 138 | // free(P1); |
---|
| 139 | } |
---|
| 140 | elapsed_time += omp_get_wtime(); |
---|
| 141 | printf("compression and quantization time: %.4f\n", elapsed_time); |
---|
| 142 | elapsed_time = -omp_get_wtime(); |
---|
| 143 | // printf("unpred count:\n"); |
---|
| 144 | // for(int i=0; i<num_blocks; i++){ |
---|
| 145 | // printf("%d ", unpredictable_count[i]); |
---|
| 146 | // } |
---|
| 147 | // printf("\n"); |
---|
| 148 | // printf("total_unpred num: %d\n", total_unpred); |
---|
| 149 | // printf("Block wise compression end, num_elements %ld\n", num_elements); |
---|
| 150 | // huffman encode |
---|
| 151 | |
---|
| 152 | size_t nodeCount = 0; |
---|
| 153 | Huffman_init_openmp(huffmanTree, result_type, num_elements, thread_num, freq); |
---|
| 154 | elapsed_time += omp_get_wtime(); |
---|
| 155 | printf("Build Huffman: %.4f\n", elapsed_time); |
---|
| 156 | elapsed_time = -omp_get_wtime(); |
---|
| 157 | for (size_t i = 0; i < stateNum; i++) |
---|
| 158 | if (huffmanTree->code[i]) nodeCount++; |
---|
| 159 | nodeCount = nodeCount*2-1; |
---|
| 160 | unsigned char *treeBytes; |
---|
| 161 | unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes); |
---|
| 162 | |
---|
| 163 | unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength; |
---|
| 164 | size_t total_unpred = 0; |
---|
| 165 | for(int i=0; i<num_blocks; i++){ |
---|
| 166 | total_unpred += unpredictable_count[i]; |
---|
| 167 | // printf("%d: %d mean %.2f\n", i, unpredictable_count[i], mean[i]); |
---|
| 168 | } |
---|
| 169 | unsigned char * result_pos = result; |
---|
| 170 | initRandomAccessBytes(result_pos); |
---|
| 171 | result_pos += meta_data_offset; |
---|
| 172 | |
---|
| 173 | size_t enCodeSize = 0; |
---|
| 174 | |
---|
| 175 | intToBytes_bigEndian(result_pos, thread_num); |
---|
| 176 | result_pos += 4; |
---|
| 177 | doubleToBytes(result_pos, realPrecision); |
---|
| 178 | result_pos += 8; |
---|
| 179 | intToBytes_bigEndian(result_pos, quantization_intervals); |
---|
| 180 | result_pos += 4; |
---|
| 181 | intToBytes_bigEndian(result_pos, treeByteSize); |
---|
| 182 | result_pos += 4; |
---|
| 183 | intToBytes_bigEndian(result_pos, nodeCount); |
---|
| 184 | result_pos += 4; |
---|
| 185 | memcpy(result_pos, treeBytes, treeByteSize); |
---|
| 186 | result_pos += treeByteSize; |
---|
| 187 | |
---|
| 188 | memcpy(result_pos, unpredictable_count, num_blocks * sizeof(unsigned int)); |
---|
| 189 | result_pos += num_blocks * sizeof(unsigned int); |
---|
| 190 | memcpy(result_pos, mean, num_blocks * sizeof(float)); |
---|
| 191 | result_pos += num_blocks * sizeof(float); |
---|
| 192 | // printf("unpred offset: %ld\n", result_pos - result); |
---|
| 193 | // store unpredicable data |
---|
| 194 | // float * unpred_pos = (float *) result_pos; |
---|
| 195 | // for(int t=0; t<thread_num; t++){ |
---|
| 196 | // float * unpredictable_data = result_unpredictable_data + t * unpred_data_max_size; |
---|
| 197 | // memcpy(result_pos, unpredictable_data, unpredictable_count[t] * sizeof(float)); |
---|
| 198 | // result_pos += unpredictable_count[t]*sizeof(float); |
---|
| 199 | // } |
---|
| 200 | unpred_offset[0] = 0; |
---|
| 201 | for(int t=1; t<thread_num; t++){ |
---|
| 202 | unpred_offset[t] = unpredictable_count[t-1] + unpred_offset[t-1]; |
---|
| 203 | } |
---|
| 204 | #pragma omp parallel for |
---|
| 205 | for(int t=0; t<thread_num; t++){ |
---|
| 206 | int id = omp_get_thread_num(); |
---|
| 207 | float * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size; |
---|
| 208 | memcpy(result_pos + unpred_offset[id] * sizeof(float), unpredictable_data, unpredictable_count[id] * sizeof(float)); |
---|
| 209 | } |
---|
| 210 | result_pos += total_unpred * sizeof(float); |
---|
| 211 | |
---|
| 212 | elapsed_time += omp_get_wtime(); |
---|
| 213 | printf("write misc time: %.4f\n", elapsed_time); |
---|
| 214 | elapsed_time = -omp_get_wtime(); |
---|
| 215 | |
---|
| 216 | size_t * block_pos = (size_t *) result_pos; |
---|
| 217 | result_pos += num_blocks * sizeof(size_t); |
---|
| 218 | #pragma omp parallel for |
---|
| 219 | for(int t=0; t<thread_num; t++){ |
---|
| 220 | int id = omp_get_thread_num(); |
---|
| 221 | int i = id/(num_yz); |
---|
| 222 | int j = (id % num_yz) / num_z; |
---|
| 223 | int k = id % num_z; |
---|
| 224 | unsigned char * encoding_buffer_pos = encoding_buffer + id * max_num_block_elements * sizeof(int); |
---|
| 225 | size_t enCodeSize = 0; |
---|
| 226 | size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; |
---|
| 227 | size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; |
---|
| 228 | size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; |
---|
| 229 | size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x; |
---|
| 230 | size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y; |
---|
| 231 | size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z; |
---|
| 232 | size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z; |
---|
| 233 | size_t type_offset = offset_x * dim0_offset + offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y; |
---|
| 234 | int * type = result_type + type_offset; |
---|
| 235 | encode(huffmanTree, type, current_block_elements, encoding_buffer_pos, &enCodeSize); |
---|
| 236 | block_pos[id] = enCodeSize; |
---|
| 237 | } |
---|
| 238 | elapsed_time += omp_get_wtime(); |
---|
| 239 | printf("Parallel Huffman encoding elapsed time: %.4f\n", elapsed_time); |
---|
| 240 | elapsed_time = -omp_get_wtime(); |
---|
| 241 | // for(int t=0; t<thread_num; t++){ |
---|
| 242 | // memcpy(result_pos, encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]); |
---|
| 243 | // result_pos += block_pos[t]; |
---|
| 244 | // } |
---|
| 245 | block_offset[0] = 0; |
---|
| 246 | for(int t=1; t<thread_num; t++){ |
---|
| 247 | block_offset[t] = block_pos[t-1] + block_offset[t-1]; |
---|
| 248 | } |
---|
| 249 | #pragma omp parallel for |
---|
| 250 | for(int t=0; t<thread_num; t++){ |
---|
| 251 | int id = omp_get_thread_num(); |
---|
| 252 | memcpy(result_pos + block_offset[id], encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]); |
---|
| 253 | } |
---|
| 254 | result_pos += block_offset[thread_num - 1] + block_pos[thread_num - 1]; |
---|
| 255 | |
---|
| 256 | elapsed_time += omp_get_wtime(); |
---|
| 257 | printf("Final copy elapsed time: %.4f\n", elapsed_time); |
---|
| 258 | // { |
---|
| 259 | // int status; |
---|
| 260 | // writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/comp001_type.dat", &status); |
---|
| 261 | // } |
---|
| 262 | |
---|
| 263 | // int status; |
---|
| 264 | // writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/omp_type.dat", &status); |
---|
| 265 | // printf("type array size: %ld\n", enCodeSize); |
---|
| 266 | result_pos += enCodeSize; |
---|
| 267 | size_t totalEncodeSize = 0; |
---|
| 268 | totalEncodeSize = result_pos - result; |
---|
| 269 | // printf("Total size %ld\n", totalEncodeSize); |
---|
| 270 | free(freq); |
---|
| 271 | free(buffer0); |
---|
| 272 | free(buffer1); |
---|
| 273 | free(treeBytes); |
---|
| 274 | free(unpred_offset); |
---|
| 275 | free(block_offset); |
---|
| 276 | free(encoding_buffer); |
---|
| 277 | free(mean); |
---|
| 278 | free(result_unpredictable_data); |
---|
| 279 | free(unpredictable_count); |
---|
| 280 | free(result_type); |
---|
| 281 | SZ_ReleaseHuffman(huffmanTree); |
---|
| 282 | |
---|
| 283 | *comp_size = totalEncodeSize; |
---|
| 284 | return result; |
---|
| 285 | } |
---|
| 286 | |
---|
/**
 * OpenMP decompression of a 1D float array -- not implemented.
 *
 * @param data       output pointer; set to NULL (when non-NULL itself) so a
 *                   caller can tell that no data was produced
 * @param r1         number of elements (unused)
 * @param comp_data  compressed stream (unused)
 */
void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data){
    (void) r1;
    (void) comp_data;
    if(data != NULL)
        *data = NULL;
}
---|
/**
 * OpenMP decompression of a 2D float array -- not implemented.
 *
 * @param data       output pointer; set to NULL (when non-NULL itself) so a
 *                   caller can tell that no data was produced
 * @param r1         size of the first dimension (unused)
 * @param r2         size of the second dimension (unused)
 * @param comp_data  compressed stream (unused)
 */
void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data){
    (void) r1;
    (void) r2;
    (void) comp_data;
    if(data != NULL)
        *data = NULL;
}
---|
| 291 | |
---|
| 292 | |
---|
| 293 | void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){ |
---|
| 294 | |
---|
| 295 | if(confparams_dec==NULL) |
---|
| 296 | confparams_dec = (sz_params*)malloc(sizeof(sz_params)); |
---|
| 297 | memset(confparams_dec, 0, sizeof(sz_params)); |
---|
| 298 | if(exe_params==NULL) |
---|
| 299 | exe_params = (sz_exedata*)malloc(sizeof(sz_exedata)); |
---|
| 300 | memset(exe_params, 0, sizeof(sz_exedata)); |
---|
| 301 | |
---|
| 302 | // printf("num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks); |
---|
| 303 | // fflush(stdout); |
---|
| 304 | double elapsed_time = 0.0; |
---|
| 305 | elapsed_time = -omp_get_wtime(); |
---|
| 306 | |
---|
| 307 | size_t dim0_offset = r2 * r3; |
---|
| 308 | size_t dim1_offset = r3; |
---|
| 309 | size_t num_elements = r1 * r2 * r3; |
---|
| 310 | |
---|
| 311 | unsigned char * comp_data_pos = comp_data; |
---|
| 312 | //int meta_data_offset = 3 + 1 + MetaDataByteLength; |
---|
| 313 | //comp_data_pos += meta_data_offset; |
---|
| 314 | |
---|
| 315 | int thread_num = bytesToInt_bigEndian(comp_data_pos); |
---|
| 316 | comp_data_pos += 4; |
---|
| 317 | int thread_order = (int)log2(thread_num); |
---|
| 318 | size_t num_x = 0, num_y = 0, num_z = 0; |
---|
| 319 | { |
---|
| 320 | int block_thread_order = thread_order / 3; |
---|
| 321 | switch(thread_order % 3){ |
---|
| 322 | case 0:{ |
---|
| 323 | num_x = 1 << block_thread_order; |
---|
| 324 | num_y = 1 << block_thread_order; |
---|
| 325 | num_z = 1 << block_thread_order; |
---|
| 326 | break; |
---|
| 327 | } |
---|
| 328 | case 1:{ |
---|
| 329 | num_x = 1 << (block_thread_order + 1); |
---|
| 330 | num_y = 1 << block_thread_order; |
---|
| 331 | num_z = 1 << block_thread_order; |
---|
| 332 | break; |
---|
| 333 | } |
---|
| 334 | case 2:{ |
---|
| 335 | num_x = 1 << (block_thread_order + 1); |
---|
| 336 | num_y = 1 << (block_thread_order + 1); |
---|
| 337 | num_z = 1 << block_thread_order; |
---|
| 338 | break; |
---|
| 339 | } |
---|
| 340 | } |
---|
| 341 | } |
---|
| 342 | printf("number of blocks: %zu %zu %zu, thread_num %d\n", num_x, num_y, num_z, thread_num); |
---|
| 343 | omp_set_num_threads(thread_num); |
---|
| 344 | size_t split_index_x, split_index_y, split_index_z; |
---|
| 345 | size_t early_blockcount_x, early_blockcount_y, early_blockcount_z; |
---|
| 346 | size_t late_blockcount_x, late_blockcount_y, late_blockcount_z; |
---|
| 347 | SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x); |
---|
| 348 | SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y); |
---|
| 349 | SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z); |
---|
| 350 | |
---|
| 351 | size_t num_blocks = num_x * num_y * num_z; |
---|
| 352 | size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); |
---|
| 353 | *data = (float*)malloc(sizeof(float)*num_elements); |
---|
| 354 | int * result_type = (int *) malloc(num_elements * sizeof(int)); |
---|
| 355 | size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t)); |
---|
| 356 | |
---|
| 357 | double realPrecision = bytesToDouble(comp_data_pos); |
---|
| 358 | comp_data_pos += 8; |
---|
| 359 | unsigned int intervals = bytesToInt_bigEndian(comp_data_pos); |
---|
| 360 | comp_data_pos += 4; |
---|
| 361 | |
---|
| 362 | size_t stateNum = intervals*2; |
---|
| 363 | HuffmanTree* huffmanTree = createHuffmanTree(stateNum); |
---|
| 364 | |
---|
| 365 | updateQuantizationInfo(intervals); |
---|
| 366 | // exe_params->intvRadius = (int)((tdps->intervals - 1)/ 2); |
---|
| 367 | |
---|
| 368 | unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos); |
---|
| 369 | comp_data_pos += 4; |
---|
| 370 | size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos); |
---|
| 371 | huffmanTree->allNodes = huffman_nodes; |
---|
| 372 | // printf("Reconstruct huffman tree with node count %ld\n", nodeCount); |
---|
| 373 | // fflush(stdout); |
---|
| 374 | node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes); |
---|
| 375 | |
---|
| 376 | comp_data_pos += 4 + tree_size; |
---|
| 377 | unsigned int * unpred_count = (unsigned int *) comp_data_pos; |
---|
| 378 | comp_data_pos += num_blocks * sizeof(unsigned int); |
---|
| 379 | float * mean_pos = (float *) comp_data_pos; |
---|
| 380 | comp_data_pos += num_blocks * sizeof(float); |
---|
| 381 | float * result_unpredictable_data = (float *) comp_data_pos; |
---|
| 382 | size_t total_unpred = 0; |
---|
| 383 | for(int i=0; i<num_blocks; i++){ |
---|
| 384 | unpred_offset[i] = total_unpred; |
---|
| 385 | total_unpred += unpred_count[i]; |
---|
| 386 | } |
---|
| 387 | comp_data_pos += total_unpred * sizeof(float); |
---|
| 388 | |
---|
| 389 | // printf("unpred count:\n"); |
---|
| 390 | // for(int i=0; i<num_blocks; i++){ |
---|
| 391 | // printf("%d ", unpred_count[i]); |
---|
| 392 | // } |
---|
| 393 | // printf("\n"); |
---|
| 394 | // for(int i=0; i<1000; i++){ |
---|
| 395 | // printf("%.2f ", result_unpredictable_data[i]); |
---|
| 396 | // } |
---|
| 397 | // printf("\ntotal_unpred num: %d\n", total_unpred); |
---|
| 398 | |
---|
| 399 | // for(int i=0; i<num_blocks; i++){ |
---|
| 400 | // printf("%d unpred offset %ld\n", i, unpred_offset[i]); |
---|
| 401 | // for(int tmp=0; tmp<10; tmp++){ |
---|
| 402 | // printf("%.2f ", (result_unpredictable_data + unpred_offset[i])[tmp]); |
---|
| 403 | // } |
---|
| 404 | // printf("\n"); |
---|
| 405 | // } |
---|
| 406 | // exit(0); |
---|
| 407 | // printf("Block wise decompression start: %d %d %d\n", early_blockcount_x, early_blockcount_y, early_blockcount_z); |
---|
| 408 | // fflush(stdout); |
---|
| 409 | // decode(comp_data_pos, num_elements, root, result_type); |
---|
| 410 | size_t * block_pos = (size_t *) comp_data_pos; |
---|
| 411 | comp_data_pos += num_blocks * sizeof(size_t); |
---|
| 412 | block_offset[0] = 0; |
---|
| 413 | for(int t=1; t<thread_num; t++){ |
---|
| 414 | block_offset[t] = block_pos[t-1] + block_offset[t-1]; |
---|
| 415 | } |
---|
| 416 | int num_yz = num_y * num_z; |
---|
| 417 | elapsed_time += omp_get_wtime(); |
---|
| 418 | printf("Read data info elapsed time: %.4f\n", elapsed_time); |
---|
| 419 | elapsed_time = -omp_get_wtime(); |
---|
| 420 | #pragma omp parallel for |
---|
| 421 | for(int t=0; t<thread_num; t++){ |
---|
| 422 | int id = omp_get_thread_num(); |
---|
| 423 | int i = id/(num_yz); |
---|
| 424 | int j = (id % num_yz) / num_z; |
---|
| 425 | int k = id % num_z; |
---|
| 426 | size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; |
---|
| 427 | size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; |
---|
| 428 | size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; |
---|
| 429 | size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x; |
---|
| 430 | size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y; |
---|
| 431 | size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z; |
---|
| 432 | size_t type_offset = offset_x * dim0_offset + offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y; |
---|
| 433 | int * type = result_type + type_offset; |
---|
| 434 | decode(comp_data_pos + block_offset[id], current_blockcount_x*current_blockcount_y*current_blockcount_z, root, type); |
---|
| 435 | } |
---|
| 436 | elapsed_time += omp_get_wtime(); |
---|
| 437 | printf("Parallel Huffman decoding elapsed time: %.4f\n", elapsed_time); |
---|
| 438 | elapsed_time = -omp_get_wtime(); |
---|
| 439 | |
---|
| 440 | #pragma omp parallel for |
---|
| 441 | for(int t=0; t<thread_num; t++){ |
---|
| 442 | int id = omp_get_thread_num(); |
---|
| 443 | int i = id/(num_yz); |
---|
| 444 | int j = (id % num_yz) / num_z; |
---|
| 445 | int k = id % num_z; |
---|
| 446 | // printf("%d: %d %d %d\n", omp_get_thread_num(), i, j, k); |
---|
| 447 | size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x; |
---|
| 448 | size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y; |
---|
| 449 | size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z; |
---|
| 450 | float * data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z; |
---|
| 451 | |
---|
| 452 | size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x; |
---|
| 453 | size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y; |
---|
| 454 | size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z; |
---|
| 455 | size_t type_offset = offset_x * dim0_offset + offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y; |
---|
| 456 | int * type = result_type + type_offset; |
---|
| 457 | |
---|
| 458 | float * unpredictable_data = result_unpredictable_data + unpred_offset[id]; |
---|
| 459 | float mean = mean_pos[id]; |
---|
| 460 | // printf("\n%d\ndata_offset: %ld\n", t, offset_x * dim0_offset + offset_y * dim1_offset + offset_z); |
---|
| 461 | // printf("mean: %.2f\n", mean); |
---|
| 462 | // for(int tmp=0; tmp<10; tmp++){ |
---|
| 463 | // printf("%.2f ", unpredictable_data[tmp]); |
---|
| 464 | // } |
---|
| 465 | // printf("\n\n"); |
---|
| 466 | decompressDataSeries_float_3D_RA_block(data_pos, mean, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, type, unpredictable_data); |
---|
| 467 | } |
---|
| 468 | elapsed_time += omp_get_wtime(); |
---|
| 469 | printf("Parallel decompress elapsed time: %.4f\n", elapsed_time); |
---|
| 470 | |
---|
| 471 | free(block_offset); |
---|
| 472 | free(result_type); |
---|
| 473 | free(unpred_offset); |
---|
| 474 | SZ_ReleaseHuffman(huffmanTree); |
---|
| 475 | } |
---|
| 476 | |
---|
| 477 | void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq){ |
---|
| 478 | |
---|
| 479 | size_t i; |
---|
| 480 | // size_t *freq = (size_t *)malloc(thread_num*huffmanTree->allNodes*sizeof(size_t)); |
---|
| 481 | // memset(freq, 0, thread_num*huffmanTree->allNodes*sizeof(size_t)); |
---|
| 482 | size_t block_size = (length - 1)/ thread_num + 1; |
---|
| 483 | size_t block_residue = length - (thread_num - 1) * block_size; |
---|
| 484 | #pragma omp parallel for |
---|
| 485 | for(int t=0; t<thread_num; t++){ |
---|
| 486 | int id = omp_get_thread_num(); |
---|
| 487 | int * s_pos = s + id * block_size; |
---|
| 488 | size_t * freq_pos = freq + id * huffmanTree->allNodes; |
---|
| 489 | if(id < thread_num - 1){ |
---|
| 490 | for(size_t i=0; i<block_size; i++){ |
---|
| 491 | freq_pos[s_pos[i]] ++; |
---|
| 492 | } |
---|
| 493 | } |
---|
| 494 | else{ |
---|
| 495 | for(size_t i=0; i<block_residue; i++){ |
---|
| 496 | freq_pos[s_pos[i]] ++; |
---|
| 497 | } |
---|
| 498 | } |
---|
| 499 | } |
---|
| 500 | size_t * freq_pos = freq + huffmanTree->allNodes; |
---|
| 501 | for(int t=1; t<thread_num; t++){ |
---|
| 502 | for(i = 0; i<huffmanTree->allNodes; i++){ |
---|
| 503 | freq[i] += freq_pos[i]; |
---|
| 504 | } |
---|
| 505 | freq_pos += huffmanTree->allNodes; |
---|
| 506 | } |
---|
| 507 | |
---|
| 508 | for (i = 0; i < huffmanTree->allNodes; i++) |
---|
| 509 | if (freq[i]) |
---|
| 510 | qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0)); |
---|
| 511 | |
---|
| 512 | while (huffmanTree->qend > 2) |
---|
| 513 | qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree))); |
---|
| 514 | |
---|
| 515 | build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0); |
---|
| 516 | // free(freq); |
---|
| 517 | } |
---|
| 518 | |
---|
| 519 | |
---|
| 520 | |
---|