/**
 *  @file sz_omp.c
 *  @author Xin Liang
 *  @date July, 2017
 *  @brief The implementation of the OpenMP version.
 *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

/*
 * NOTE(review): this copy of the file is damaged. Its line breaks were
 * collapsed and several spans of text are missing; each place where text
 * appears to have been lost is marked with NOTE(review) below. The surviving
 * tokens are kept exactly as found, so the file does not compile as is.
 * Restore the missing spans from the upstream file before changing any logic.
 */

#include "sz_omp.h"
/*
 * NOTE(review): the two bare #include directives below have lost their header
 * names. The code in this file calls omp_get_wtime() and log2(), so they were
 * presumably <omp.h> and <math.h> -- confirm against the upstream file.
 */
#include
#include

/**
 * 1D OpenMP compression entry point.
 *
 * Not implemented: the body only returns NULL. None of the parameters are
 * read, and *comp_size is not written.
 *
 * @return always NULL
 */
unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
    return NULL;
}

/**
 * 2D OpenMP compression entry point.
 *
 * Not implemented: the body only returns NULL. None of the parameters are
 * read, and *comp_size is not written.
 *
 * @return always NULL
 */
unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
    return NULL;
}

/**
 * 3D OpenMP compression entry point.
 *
 * Only the set-up part of this function survives in this copy: interval
 * selection, choice of the thread/block grid, and buffer allocation. The
 * parallel compression loop and the end of the function are missing.
 *
 * @param oriData        input values; r1 * r2 * r3 elements are assumed
 *                       (num_elements below). The strides r2 * r3 and r3 are
 *                       computed as dim0_offset / dim1_offset, which suggests
 *                       r3 is the fastest-varying dimension -- TODO confirm.
 * @param r1,r2,r3       extents of the three dimensions
 * @param realPrecision  passed to the interval optimizer and printed as
 *                       "error bound"
 * @param comp_size      not used in the visible part of the function;
 *                       presumably receives the compressed size -- confirm
 * @return not visible in this copy
 */
unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){

    /* Wall-clock timing idiom: store -start, then add the end time. */
    double elapsed_time = 0.0;
    elapsed_time = -omp_get_wtime();
    unsigned int quantization_intervals;
    if(exe_params->optQuantMode==1)
    {
        /* optQuantMode == 1: pick the interval count from the data and
         * register it through updateQuantizationInfo(). */
        // quantization_intervals = optimize_intervals_float_3D(oriData, r1, realPrecision);
        quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision);
        //quantization_intervals = 32768;
        /* NOTE(review): quantization_intervals is unsigned int but is printed
         * with %d; %u would match the type. */
        printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision);
        // exit(0);
        updateQuantizationInfo(quantization_intervals);
    }
    else{
        /* Otherwise use the preconfigured capacity. */
        quantization_intervals = exe_params->intvCapacity;
    }
    elapsed_time += omp_get_wtime();
    printf("opt interval time: %.4f\n", elapsed_time);

    /* Restart the timer; the matching stop is not in the visible text. */
    elapsed_time = -omp_get_wtime();

    /*
     * Choose a num_x * num_y * num_z grid of blocks, one block per thread.
     * thread_order is log2 of the maximum thread count truncated to int; the
     * three factors are powers of two whose exponents sum to thread_order,
     * with the remainder (thread_order % 3) given to x first and then y.
     * thread_num is then reset to the product, i.e. 2^thread_order, so a
     * thread count that is not a power of two is rounded down.
     */
    int thread_num = omp_get_max_threads();
    int thread_order = (int)log2(thread_num);
    size_t num_x = 0, num_y = 0, num_z = 0;
    {
        int block_thread_order = thread_order / 3;
        switch(thread_order % 3){
            case 0:{
                num_x = 1 << block_thread_order;
                num_y = 1 << block_thread_order;
                num_z = 1 << block_thread_order;
                break;
            }
            case 1:{
                num_x = 1 << (block_thread_order + 1);
                num_y = 1 << block_thread_order;
                num_z = 1 << block_thread_order;
                break;
            }
            case 2:{
                num_x = 1 << (block_thread_order + 1);
                num_y = 1 << (block_thread_order + 1);
                num_z = 1 << block_thread_order;
                break;
            }
        }
        thread_num = num_x * num_y * num_z;
    }
    omp_set_num_threads(thread_num);

    // calculate block dims
    printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z);
    size_t split_index_x, split_index_y, split_index_z;
    size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
    size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
    /* SZ_COMPUTE_BLOCKCOUNT is defined outside this file. From the argument
     * names it presumably splits each extent into num_* blocks and outputs a
     * split index plus the "early" and "late" block lengths -- TODO confirm. */
    SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
    SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
    SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);

    /* Product of the three "early" block lengths; used below as the
     * per-block capacity of the per-block buffers. */
    size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
    size_t num_blocks = num_x * num_y * num_z;
    size_t num_elements = r1 * r2 * r3;
    // printf("max_num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks);

    /* Element strides for stepping along the first and second dimension. */
    size_t dim0_offset = r2 * r3;
    size_t dim1_offset = r3;
    // printf("malloc blockinfo array start\n");
    // fflush(stdout);

    /*
     * Working buffers. NOTE(review): none of the malloc() results below is
     * checked in the visible code, and where these buffers are freed is not
     * visible in this copy.
     */
    /* Bytes for one early_blockcount_y x early_blockcount_z plane of floats;
     * buffer0/buffer1 hold thread_num such planes each. */
    size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(float);
    /* One int per input element. */
    int * result_type = (int *) malloc(num_elements * sizeof(int));
    /* max_num_block_elements floats for each block. */
    size_t unpred_data_max_size = max_num_block_elements;
    float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
    /* One counter and one float per block. */
    unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int));
    float * mean = malloc(num_blocks * sizeof(float));
    float * buffer0, * buffer1;
    buffer0 = (float *) malloc(buffer_size * thread_num);
    buffer1 = (float *) malloc(buffer_size * thread_num);
    /* Output buffer sized at sizeof(int) + sizeof(float) bytes per element. */
    unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(float)));
    /* One size_t per block. */
    size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
    /* max_num_block_elements * sizeof(int) bytes for each block. */
    unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks);
    size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
    /* Zero-initialised table of quantization_intervals * 4 counters for each
     * thread. */
    size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t));
    memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t));

    size_t stateNum = quantization_intervals*2;
    HuffmanTree* huffmanTree = createHuffmanTree(stateNum);

    int num_yz = num_y * num_z;
    #pragma omp parallel for
    /*
     * NOTE(review): text is missing inside the loop header below. The
     * statement jumps from "t" straight to "code[i]) nodeCount++;", so the
     * body of the parallel loop and everything up to the counting of
     * huffmanTree code entries has been lost, including the declaration of
     * nodeCount.
     */
    for(int t=0; tcode[i]) nodeCount++;
    nodeCount = nodeCount*2-1;
    unsigned char *treeBytes;
    /* Serialise the tree; the helper returns the byte count and hands back
     * the buffer through treeBytes. */
    unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);

    unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
    size_t total_unpred = 0;
    /*
     * NOTE(review): text is missing inside the loop header below. What
     * follows reads from comp_data_pos and tdps, which are not declared in
     * the visible part of this function, and redeclares
     * result_unpredictable_data and total_unpred. It therefore presumably
     * belongs to a different function whose header was lost -- confirm
     * against the upstream file.
     */
    for(int i=0; iintvRadius = (int)((tdps->intervals - 1)/ 2);

    /* Two big-endian integers are read from the stream: a tree size, then
     * (4 bytes later) the node count stored in huffmanTree->allNodes. */
    unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
    comp_data_pos += 4;
    size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos);
    huffmanTree->allNodes = huffman_nodes;
    // printf("Reconstruct huffman tree with node count %ld\n", nodeCount);
    // fflush(stdout);
    /* The serialised tree starts 4 bytes past the node-count field. */
    node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes);

    /* Skip the node-count field and the tree bytes, then take pointers into
     * the stream: num_blocks unsigned ints, then num_blocks floats, then the
     * float data that follows them. */
    comp_data_pos += 4 + tree_size;
    unsigned int * unpred_count = (unsigned int *) comp_data_pos;
    comp_data_pos += num_blocks * sizeof(unsigned int);
    float * mean_pos = (float *) comp_data_pos;
    comp_data_pos += num_blocks * sizeof(float);
    float * result_unpredictable_data = (float *) comp_data_pos;
    size_t total_unpred = 0;
    /*
     * NOTE(review): text is missing inside the loop header below. The
     * remaining statements use length, id, freq_pos and i, none of which is
     * declared in the visible text, so this is presumably the tail of yet
     * another function whose header was lost -- confirm against the upstream
     * file.
     */
    for(int i=0; iallNodes*sizeof(size_t));
    // memset(freq, 0, thread_num*huffmanTree->allNodes*sizeof(size_t));

    /* Split `length` items into thread_num chunks of block_size (ceiling
     * division); block_residue is what remains for the last chunk. */
    size_t block_size = (length - 1)/ thread_num + 1;
    size_t block_residue = length - (thread_num - 1) * block_size;
    #pragma omp parallel for
    /* NOTE(review): text is missing in the next three loop headers; the loop
     * bounds and most of the loop bodies have been lost. */
    for(int t=0; tallNodes;
    if(id < thread_num - 1){
    for(size_t i=0; iallNodes;
    for(int t=1; tallNodes; i++){
            /* Add the table at freq_pos into freq, entry by entry. */
            freq[i] += freq_pos[i];
        }
        /* Advance to the next table of huffmanTree->allNodes entries. */
        freq_pos += huffmanTree->allNodes;
    }

    /* Queue one node for every index whose frequency is nonzero. */
    for (i = 0; i < huffmanTree->allNodes; i++)
        if (freq[i])
            qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));

    /*
     * Repeatedly remove two nodes and insert a node built from them until
     * qend is no longer greater than 2.
     * NOTE(review): C does not specify the order in which the two qremove()
     * arguments are evaluated, so which removed node ends up in which
     * argument position may differ between compilers.
     */
    while (huffmanTree->qend > 2)
        qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree)));

    /* Build the codes starting from the node left at qq[1]. */
    build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
    // free(freq);
}