Context Navigation

← Previous Change
Next Change →

sz_float.c

Timestamp:

09/28/18 16:32:55 (6 years ago)

Author:

Hal Finkel <hfinkel@…>

Branches:

master, pympi

Children:

e6aa0eb

Parents:

abca157

git-author:

Hal Finkel <hfinkel@…> (09/28/18 16:32:55)

git-committer:

Hal Finkel <hfinkel@…> (09/28/18 16:32:55)

Message:

importing new SZ files

File:

: 1 edited

thirdparty/SZ/sz/src/sz_float.c (modified) (24 diffs)

Legend:

: Unmodified
: Added
: Removed

thirdparty/SZ/sz/src/sz_float.c

-                      r2c47b73
+                      r9ee2ce3
 /**
  *  @file sz_float.c
  *  @author Sheng Di and Dingwen Tao
+ *  @author Sheng Di, Dingwen Tao, Xin Liang
  *  @date Aug, 2016
  *  @brief SZ_Init, Compression and Decompression functions
 …
 #include "rw.h"
 #include "sz_float_ts.h"
+#include "utility.h"
 unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize)
 …
                 pred = last3CmprsData[0];
                 predAbsErr = fabs(curData - pred);
                 if(predAbsErr<=checkRadius)
+                if(predAbsErr<checkRadius)
+                {
                         state = (predAbsErr/realPrecision+1)/2;
 …
+                }
                 else
+                {
+                        tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+                {
+                        if(sz_with_regression == SZ_NO_REGRESSION)
+                                tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+                        else
+                                *newByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r1, r2, r3, realPrecision, outSize);
                         compressionType = 0; //snapshot-based compression
                         multisteps->lastSnapshotStep = timestep;
 …
                 tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
         convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
         if(*outSize>dataLength*sizeof(float))
                 SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
         free_TightDataPointStorageF(tdps);
+        if(tdps!=NULL)
+        {
+                convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+                if(*outSize>dataLength*sizeof(float))
+                        SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+                free_TightDataPointStorageF(tdps);
+        }
         return compressionType;
+}
 …
                         if(errBoundMode>=PW_REL)
+                        {
                                 //SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(newByteData, oriData, realPrecision, r1, outSize, min, max);
                                 SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+                                SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r1, outSize, min, max);
+                                //SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+                        }
                         else
 …
+                {
                         if(errBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(newByteData, oriData, realPrecision, r2, r1, outSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r2, r1, outSize, min, max);
                         else
                                 SZ_compress_args_float_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 …
+                {
                         if(errBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r3, r2, r1, outSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r3, r2, r1, outSize, min, max);
                         else
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 …
+                {
                         if(errBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r4*r3, r2, r1, outSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(newByteData, oriData, pwrErrRatio, r4*r3, r2, r1, outSize, min, max);
                         else
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
 …
                         if(confparams_cpr->errorBoundMode>=PW_REL)
+                        {
+                                //SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(&tmpByteData, oriData, realPrecision, r1, &tmpOutSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio,
+                                valueRangeSize, medianValue, &tmpOutSize);
+                                SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r1, &tmpOutSize, min, max);
+                                //SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, valueRangeSize, medianValue, &tmpOutSize);
+                        }
                         else
 …
+                {
                         if(confparams_cpr->errorBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(&tmpByteData, oriData, realPrecision, r2, r1, &tmpOutSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r2, r1, &tmpOutSize, min, max);
                         else
 #ifdef HAVE_TIMECMPR
 …
                                 else
 #endif
+                                        SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                {
+                                        if(sz_with_regression == SZ_NO_REGRESSION)
+                                                SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                        else
+                                                tmpByteData = SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(oriData, r2, r1, realPrecision, &tmpOutSize);
+                                }
+                }
                 else
 …
+                {
                         if(confparams_cpr->errorBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r3, r2, r1, &tmpOutSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r3, r2, r1, &tmpOutSize, min, max);
                         else
 #ifdef HAVE_TIMECMPR
                                 if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
                                         multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                                multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
                                 else
 #endif
+                                        SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                {
+                                        if(sz_with_regression == SZ_NO_REGRESSION)
+                                                SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                        else
+                                                tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r3, r2, r1, realPrecision, &tmpOutSize);
+                                }
+                }
                 else
 …
+                {
                         if(confparams_cpr->errorBoundMode>=PW_REL)
                                 SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r4*r3, r2, r1, &tmpOutSize, min, max);
+                                SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(&tmpByteData, oriData, pwRelBoundRatio, r4*r3, r2, r1, &tmpOutSize, min, max);
                                 //ToDO
                                 //SZ_compress_args_float_NoCkRngeNoGzip_4D_pwr(&tmpByteData, oriData, r4, r3, r2, r1, &tmpOutSize, min, max);
 …
                                 else
 #endif
+                                        SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                {
+                                        if(sz_with_regression == SZ_NO_REGRESSION)
+                                                SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+                                        else
+                                                tmpByteData = SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(oriData, r4*r3, r2, r1, realPrecision, &tmpOutSize);
+                                }
+                }
                 else
 …
                 else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION || confparams_cpr->szMode==SZ_TEMPORAL_COMPRESSION)
+                {
                         *outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+                        *outSize = sz_lossless_compress(confparams_cpr->losslessCompressor, confparams_cpr->gzipMode, tmpByteData, tmpOutSize, newByteData);
                         free(tmpByteData);
+                }
 …
         size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
         memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
         size_t totalSampleSize = 0;//(r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+        size_t totalSampleSize = 0;
         size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
 …
+                {
                         radiusIndex = confparams_cpr->maxRangeRadius - 1;
-                        //printf("radiusIndex=%d\n", radiusIndex);
+                }
                 intervals[radiusIndex]++;
-                // printf("TEST: %ld, i: %ld\tj: %ld\tk: %ld\n", data_pos - oriData);
-                // fflush(stdout);
                 offset_count += confparams_cpr->sampleDistance;
                 if(offset_count >= r3){
 …
                 else data_pos += confparams_cpr->sampleDistance;
+        }
-        // printf("sample_count: %ld\n", sample_count);
-        // fflush(stdout);
-        // if(*max_freq < 0.15) *max_freq *= 2;
         //compute the appropriate number
         size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
 …
                 powerOf2 = 32;
         free(intervals);
-        //printf("targetCount=%d, sum=%d, totalSampleSize=%d, ratio=%f, accIntervals=%d, powerOf2=%d\n", targetCount, sum, totalSampleSize, (double)sum/(double)totalSampleSize, accIntervals, powerOf2);
         return powerOf2;
+}
 …
         size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
         memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+        size_t totalSampleSize = 0;//(r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+        //float max = oriData[0];
+        //float min = oriData[0];
+        size_t totalSampleSize = 0;
         size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
 …
         while(data_pos - oriData < dataLength){
                 totalSampleSize++;
-                //pred_value = 2*data_pos[-1] - data_pos[-2];
                 pred_value = data_pos[-1];
                 pred_err = fabs(pred_value - *data_pos);
 …
         free(intervals);
-        //printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
         return powerOf2;
+}
 …
+}
+/*The above code is for sz 1.4.13; the following code is for sz 2.0*/
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{
+        float mean = 0.0;
+        size_t len = r1 * r2;
+        size_t mean_distance = (int) (sqrt(len));
+        float * data_pos = oriData;
+        size_t mean_count = 0;
+        while(data_pos - oriData < len){
+                mean += *data_pos;
+                mean_count ++;
+                data_pos += mean_distance;
+        }
+        if(mean_count > 0) mean /= mean_count;
+        size_t range = 8192;
+        size_t radius = 4096;
+        size_t * freq_intervals = (size_t *) malloc(range*sizeof(size_t));
+        memset(freq_intervals, 0, range*sizeof(size_t));
+        unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+        int sampleDistance = confparams_cpr->sampleDistance;
+        float predThreshold = confparams_cpr->predThreshold;
+        size_t i;
+        size_t radiusIndex;
+        float pred_value = 0, pred_err;
+        size_t *intervals = (size_t*)malloc(maxRangeRadius*sizeof(size_t));
+        memset(intervals, 0, maxRangeRadius*sizeof(size_t));
+        float mean_diff;
+        ptrdiff_t freq_index;
+        size_t freq_count = 0;
+        size_t n1_count = 1;
+        size_t offset_count = sampleDistance - 1;
+        size_t offset_count_2 = 0;
+        size_t sample_count = 0;
+        data_pos = oriData + r2 + offset_count;
+        while(data_pos - oriData < len){
+                pred_value = data_pos[-1] + data_pos[-r2] - data_pos[-r2-1];
+                pred_err = fabs(pred_value - *data_pos);
+                if(pred_err < realPrecision) freq_count ++;
+                radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+                if(radiusIndex>=maxRangeRadius)
+                        radiusIndex = maxRangeRadius - 1;
+                intervals[radiusIndex]++;
+                mean_diff = *data_pos - mean;
+                if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius;
+                else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius;
+                if(freq_index <= 0){
+                        freq_intervals[0] ++;
+                }
+                else if(freq_index >= range){
+                        freq_intervals[range - 1] ++;
+                }
+                else{
+                        freq_intervals[freq_index] ++;
+                }
+                offset_count += sampleDistance;
+                if(offset_count >= r2){
+                        n1_count ++;
+                        offset_count_2 = n1_count % sampleDistance;
+                        data_pos += (r2 + sampleDistance - offset_count) + (sampleDistance - offset_count_2);
+                        offset_count = (sampleDistance - offset_count_2);
+                        if(offset_count == 0) offset_count ++;
+                }
+                else data_pos += sampleDistance;
+                sample_count ++;
+        }
+        *max_freq = freq_count * 1.0/ sample_count;
+        //compute the appropriate number
+        size_t targetCount = sample_count*predThreshold;
+        size_t sum = 0;
+        for(i=0;i<maxRangeRadius;i++)
+        {
+                sum += intervals[i];
+                if(sum>targetCount)
+                        break;
+        }
+        if(i>=maxRangeRadius)
+                i = maxRangeRadius-1;
+        unsigned int accIntervals = 2*(i+1);
+        unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+        if(powerOf2<32)
+                powerOf2 = 32;
+        // collect frequency
+        size_t max_sum = 0;
+        size_t max_index = 0;
+        size_t tmp_sum;
+        size_t * freq_pos = freq_intervals + 1;
+        for(size_t i=1; i<range-2; i++){
+                tmp_sum = freq_pos[0] + freq_pos[1];
+                if(tmp_sum > max_sum){
+                        max_sum = tmp_sum;
+                        max_index = i;
+                }
+                freq_pos ++;
+        }
+        *dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius);
+        *mean_freq = max_sum * 1.0 / sample_count;
+        free(freq_intervals);
+        free(intervals);
+        return powerOf2;
+}
+// 2D:  modified for higher performance
/* Smaller of a and b. The arguments and the whole expansion are parenthesized
 * so the macro composes correctly inside larger expressions. Each argument may
 * be evaluated twice, so do not pass expressions with side effects. */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+        unsigned int quantization_intervals;
+        float sz_sample_correct_freq = -1;//0.5; //-1
+        float dense_pos;
+        float mean_flush_freq;
+        unsigned char use_mean = 0;
+        if(exe_params->optQuantMode==1)
+        {
+                quantization_intervals = optimize_intervals_float_2D_with_freq_and_dense_pos(oriData, r1, r2, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+                if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+                updateQuantizationInfo(quantization_intervals);
+        }
+        else{
+                quantization_intervals = exe_params->intvCapacity;
+        }
+        // calculate block dims
+        size_t num_x, num_y;
+        size_t block_size = 16;
+        SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+        SZ_COMPUTE_2D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+        size_t split_index_x, split_index_y;
+        size_t early_blockcount_x, early_blockcount_y;
+        size_t late_blockcount_x, late_blockcount_y;
+        SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+        SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+        size_t max_num_block_elements = early_blockcount_x * early_blockcount_y;
+        size_t num_blocks = num_x * num_y;
+        size_t num_elements = r1 * r2;
+        size_t dim0_offset = r2;
+        int * result_type = (int *) malloc(num_elements * sizeof(int));
+        size_t unpred_data_max_size = max_num_block_elements;
+        float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+        size_t total_unpred = 0;
+        size_t unpredictable_count;
+        float * data_pos = oriData;
+        int * type = result_type;
+        size_t offset_x, offset_y;
+        size_t current_blockcount_x, current_blockcount_y;
+        float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+        float * reg_params_pos = reg_params;
+        // move regression part out
+        size_t params_offset_b = num_blocks;
+        size_t params_offset_c = 2*num_blocks;
+        for(size_t i=0; i<num_x; i++){
+                for(size_t j=0; j<num_y; j++){
+                        current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                        current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                        offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                        offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                        data_pos = oriData + offset_x * dim0_offset + offset_y;
+                        {
+                                float * cur_data_pos = data_pos;
+                                float fx = 0.0;
+                                float fy = 0.0;
+                                float f = 0;
+                                double sum_x;
+                                float curData;
+                                for(size_t i=0; i<current_blockcount_x; i++){
+                                        sum_x = 0;
+                                        for(size_t j=0; j<current_blockcount_y; j++){
+                                                curData = *cur_data_pos;
+                                                sum_x += curData;
+                                                fy += curData * j;
+                                                cur_data_pos ++;
+                                        }
+                                        fx += sum_x * i;
+                                        f += sum_x;
+                                        cur_data_pos += dim0_offset - current_blockcount_y;
+                                }
+                                float coeff = 1.0 / (current_blockcount_x * current_blockcount_y);
+                                reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+                                reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+                                reg_params_pos[params_offset_c] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2);
+                        }
+                        reg_params_pos ++;
+                }
+        }
+        //Compress coefficient arrays
+        double precision_a, precision_b, precision_c;
+        float rel_param_err = 0.15/3;
+        precision_a = rel_param_err * realPrecision / late_blockcount_x;
+        precision_b = rel_param_err * realPrecision / late_blockcount_y;
+        precision_c = rel_param_err * realPrecision;
+        float mean = 0;
+        use_mean = 0;
+        if(use_mean){
+                // compute mean
+                double sum = 0.0;
+                size_t mean_count = 0;
+                for(size_t i=0; i<num_elements; i++){
+                        if(fabs(oriData[i] - dense_pos) < realPrecision){
+                                sum += oriData[i];
+                                mean_count ++;
+                        }
+                }
+                if(mean_count > 0) mean = sum / mean_count;
+        }
+        double tmp_realPrecision = realPrecision;
+        // use two prediction buffers for higher performance
+        float * unpredictable_data = result_unpredictable_data;
+        unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+        memset(indicator, 0, num_blocks * sizeof(unsigned char));
+        size_t reg_count = 0;
+        size_t strip_dim_0 = early_blockcount_x + 1;
+        size_t strip_dim_1 = r2 + 1;
+        size_t strip_dim0_offset = strip_dim_1;
+        unsigned char * indicator_pos = indicator;
+        size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+        float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+        memset(prediction_buffer_1, 0, prediction_buffer_size);
+        float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+        memset(prediction_buffer_2, 0, prediction_buffer_size);
+        float * cur_pb_buf = prediction_buffer_1;
+        float * next_pb_buf = prediction_buffer_2;
+        float * cur_pb_buf_pos;
+        float * next_pb_buf_pos;
+        int intvCapacity = exe_params->intvCapacity;
+        int intvRadius = exe_params->intvRadius;
+        int use_reg = 0;
+        reg_params_pos = reg_params;
+        // compress the regression coefficients on the fly
+        float last_coeffcients[3] = {0.0};
+        int coeff_intvCapacity_sz = 65536;
+        int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+        int * coeff_type[3];
+        int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
+        float * coeff_unpred_data[3];
+        float * coeff_unpredictable_data = (float *) malloc(num_blocks*3*sizeof(float));
+        double precision[3];
+        precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+        for(int i=0; i<3; i++){
+                coeff_type[i] = coeff_result_type + i * num_blocks;
+                coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+        }
+        int coeff_index = 0;
+        unsigned int coeff_unpredictable_count[3] = {0};
+        if(use_mean){
+                type = result_type;
+                int intvCapacity_sz = intvCapacity - 2;
+                for(size_t i=0; i<num_x; i++){
+                        current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                        offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                        data_pos = oriData + offset_x * dim0_offset;
+                        cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+                        next_pb_buf_pos = next_pb_buf + 1;
+                        float * pb_pos = cur_pb_buf_pos;
+                        float * next_pb_pos = next_pb_buf_pos;
+                        for(size_t j=0; j<num_y; j++){
+                                offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                                current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                                /*sampling: decide which predictor to use (regression or lorenzo)*/
+                                {
+                                        float * cur_data_pos;
+                                        float curData;
+                                        float pred_reg, pred_sz;
+                                        float err_sz = 0.0, err_reg = 0.0;
+                                        // [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+                                        // [1, 9] [3, 7]                [7, 3] [9, 1]
+                                        int count = 0;
+                                        for(int i=1; i<current_blockcount_x; i+=2){
+                                                cur_data_pos = data_pos + i * dim0_offset + i;
+                                                curData = *cur_data_pos;
+                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+                                                err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+                                                err_reg += fabs(pred_reg - curData);
+                                                cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+                                                curData = *cur_data_pos;
+                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+                                                err_sz += MIN(fabs(pred_sz - curData) + realPrecision*0.81, fabs(mean - curData));
+                                                err_reg += fabs(pred_reg - curData);
+                                                count += 2;
+                                        }
+                                        use_reg = (err_reg < err_sz);
+                                }
+                                if(use_reg)
+                                {
+                                        {
+                                                /*predict coefficients in current block via previous reg_block*/
+                                                float cur_coeff;
+                                                double diff, itvNum;
+                                                for(int e=0; e<3; e++){
+                                                        cur_coeff = reg_params_pos[e*num_blocks];
+                                                        diff = cur_coeff - last_coeffcients[e];
+                                                        itvNum = fabs(diff)/precision[e] + 1;
+                                                        if (itvNum < coeff_intvCapacity_sz){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                //guarantee the compression error bound against the case of machine-epsilon
+                                                                if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        else{
+                                                                coeff_type[e][coeff_index] = 0;
+                                                                last_coeffcients[e] = cur_coeff;
+                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                        }
+                                                }
+                                                coeff_index ++;
+                                        }
+                                        float curData;
+                                        float pred;
+                                        double itvNum;
+                                        double diff;
+                                        size_t index = 0;
+                                        size_t block_unpredictable_count = 0;
+                                        float * cur_data_pos = data_pos;
+                                        for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                /*dealing with the last jj (boundary)*/
+                                                {
+                                                        size_t jj = current_blockcount_y - 1;
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to block surfaces
+                                                        pb_pos[ii * strip_dim0_offset + jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                cur_data_pos += dim0_offset - current_blockcount_y;
+                                        }
+                                        /*dealing with the last ii (boundary)*/
+                                        {
+                                                size_t ii = current_blockcount_x - 1;
+                                                for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to next prediction buffer
+                                                        next_pb_pos[jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                /*dealing with the last jj (boundary)*/
+                                                {
+                                                        size_t jj = current_blockcount_y - 1;
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to block surfaces
+                                                        pb_pos[ii * strip_dim0_offset + jj] = pred;
+                                                        // assign value to next prediction buffer
+                                                        next_pb_pos[jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                        } // end ii == -1
+                                        unpredictable_count = block_unpredictable_count;
+                                        total_unpred += unpredictable_count;
+                                        unpredictable_data += unpredictable_count;
+                                        reg_count ++;
+                                }// end use_reg
+                                else{
+                                        // use SZ
+                                        // SZ prediction
+                                        unpredictable_count = 0;
+                                        float * cur_pb_pos = pb_pos;
+                                        float * cur_data_pos = data_pos;
+                                        float curData;
+                                        float pred2D;
+                                        double itvNum, diff;
+                                        size_t index = 0;
+                                        for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                        curData = *cur_data_pos;
+                                                        if(fabs(curData - mean) <= realPrecision){
+                                                                // adjust type[index] to intvRadius for coherence with freq in reg
+                                                                type[index] = intvRadius;
+                                                                *cur_pb_pos = mean;
+                                                        }
+                                                        else
+                                                        {
+                                                                pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+                                                                diff = curData - pred2D;
+                                                                itvNum = fabs(diff)/realPrecision + 1;
+                                                                if (itvNum < intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        type[index] = (int) (itvNum/2) + intvRadius;
+                                                                        *cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                        if(type[index] <= intvRadius) type[index] -= 1;
+                                                                        //guarantee compression error bound against the case of machine-epsilon
+                                                                        if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                type[index] = 0;
+                                                                                *cur_pb_pos = curData;
+                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        type[index] = 0;
+                                                                        *cur_pb_pos = curData;
+                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        index ++;
+                                                        cur_pb_pos ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+                                                cur_data_pos += dim0_offset - current_blockcount_y;
+                                        }
+                                        /*dealing with the last ii (boundary)*/
+                                        {
+                                                // ii == current_blockcount_x - 1
+                                                for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                        curData = *cur_data_pos;
+                                                        if(fabs(curData - mean) <= realPrecision){
+                                                                // adjust type[index] to intvRadius for coherence with freq in reg
+                                                                type[index] = intvRadius;
+                                                                *cur_pb_pos = mean;
+                                                        }
+                                                        else
+                                                        {
+                                                                pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+                                                                diff = curData - pred2D;
+                                                                itvNum = fabs(diff)/realPrecision + 1;
+                                                                if (itvNum < intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        type[index] = (int) (itvNum/2) + intvRadius;
+                                                                        *cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                        if(type[index] <= intvRadius) type[index] -= 1;
+                                                                        //guarantee compression error bound against the case of machine-epsilon
+                                                                        if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                type[index] = 0;
+                                                                                *cur_pb_pos = curData;
+                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        type[index] = 0;
+                                                                        *cur_pb_pos = curData;
+                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        next_pb_pos[jj] = *cur_pb_pos;
+                                                        index ++;
+                                                        cur_pb_pos ++;
+                                                        cur_data_pos ++;
+                                                }
+                                        }
+                                        total_unpred += unpredictable_count;
+                                        unpredictable_data += unpredictable_count;
+                                        // change indicator
+                                        indicator_pos[j] = 1;
+                                }// end SZ
+                                reg_params_pos ++;
+                                data_pos += current_blockcount_y;
+                                pb_pos += current_blockcount_y;
+                                next_pb_pos += current_blockcount_y;
+                                type += current_blockcount_x * current_blockcount_y;
+                        }// end j
+                        indicator_pos += num_y;
+                        float * tmp;
+                        tmp = cur_pb_buf;
+                        cur_pb_buf = next_pb_buf;
+                        next_pb_buf = tmp;
+                }// end i
+        }// end use mean
+        else{
+                type = result_type;
+                int intvCapacity_sz = intvCapacity - 2;
+                for(size_t i=0; i<num_x; i++){
+                        current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                        offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                        data_pos = oriData + offset_x * dim0_offset;
+                        cur_pb_buf_pos = cur_pb_buf + strip_dim0_offset + 1;
+                        next_pb_buf_pos = next_pb_buf + 1;
+                        float * pb_pos = cur_pb_buf_pos;
+                        float * next_pb_pos = next_pb_buf_pos;
+                        for(size_t j=0; j<num_y; j++){
+                                offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                                current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                                /*sampling*/
+                                {
+                                        // sample [2i + 1, 2i + 1] [2i + 1, bs - 2i]
+                                        float * cur_data_pos;
+                                        float curData;
+                                        float pred_reg, pred_sz;
+                                        float err_sz = 0.0, err_reg = 0.0;
+                                        // [1, 1] [3, 3] [5, 5] [7, 7] [9, 9]
+                                        // [1, 9] [3, 7]                [7, 3] [9, 1]
+                                        int count = 0;
+                                        for(int i=1; i<current_blockcount_x; i+=2){
+                                                cur_data_pos = data_pos + i * dim0_offset + i;
+                                                curData = *cur_data_pos;
+                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c];
+                                                err_sz += fabs(pred_sz - curData);
+                                                err_reg += fabs(pred_reg - curData);
+                                                cur_data_pos = data_pos + i * dim0_offset + (block_size - i);
+                                                curData = *cur_data_pos;
+                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim0_offset] - cur_data_pos[-dim0_offset - 1];
+                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * (block_size - i) + reg_params_pos[params_offset_c];
+                                                err_sz += fabs(pred_sz - curData);
+                                                err_reg += fabs(pred_reg - curData);
+                                                count += 2;
+                                        }
+                                        err_sz += realPrecision * count * 0.81;
+                                        use_reg = (err_reg < err_sz);
+                                }
+                                if(use_reg)
+                                {
+                                        {
+                                                /*predict coefficients in current block via previous reg_block*/
+                                                float cur_coeff;
+                                                double diff, itvNum;
+                                                for(int e=0; e<3; e++){
+                                                        cur_coeff = reg_params_pos[e*num_blocks];
+                                                        diff = cur_coeff - last_coeffcients[e];
+                                                        itvNum = fabs(diff)/precision[e] + 1;
+                                                        if (itvNum < coeff_intvCapacity_sz){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        else{
+                                                                coeff_type[e][coeff_index] = 0;
+                                                                last_coeffcients[e] = cur_coeff;
+                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                        }
+                                                }
+                                                coeff_index ++;
+                                        }
+                                        float curData;
+                                        float pred;
+                                        double itvNum;
+                                        double diff;
+                                        size_t index = 0;
+                                        size_t block_unpredictable_count = 0;
+                                        float * cur_data_pos = data_pos;
+                                        for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                /*dealing with the last jj (boundary)*/
+                                                {
+                                                        // jj == current_blockcount_y - 1
+                                                        size_t jj = current_blockcount_y - 1;
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to block surfaces
+                                                        pb_pos[ii * strip_dim0_offset + jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                cur_data_pos += dim0_offset - current_blockcount_y;
+                                        }
+                                        /*dealing with the last ii (boundary)*/
+                                        {
+                                                size_t ii = current_blockcount_x - 1;
+                                                for(size_t jj=0; jj<current_blockcount_y - 1; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to next prediction buffer
+                                                        next_pb_pos[jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                /*dealing with the last jj (boundary)*/
+                                                {
+                                                        // jj == current_blockcount_y - 1
+                                                        size_t jj = current_blockcount_y - 1;
+                                                        curData = *cur_data_pos;
+                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
+                                                        diff = curData - pred;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
+                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                if(fabs(curData - pred)>realPrecision){
+                                                                        type[index] = 0;
+                                                                        pred = curData;
+                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                pred = curData;
+                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                        }
+                                                        // assign value to block surfaces
+                                                        pb_pos[ii * strip_dim0_offset + jj] = pred;
+                                                        // assign value to next prediction buffer
+                                                        next_pb_pos[jj] = pred;
+                                                        index ++;
+                                                        cur_data_pos ++;
+                                                }
+                                        } // end ii == -1
+                                        unpredictable_count = block_unpredictable_count;
+                                        total_unpred += unpredictable_count;
+                                        unpredictable_data += unpredictable_count;
+                                        reg_count ++;
+                                }// end use_reg
+                                else{
+                                        // use SZ
+                                        // SZ predication
+                                        unpredictable_count = 0;
+                                        float * cur_pb_pos = pb_pos;
+                                        float * cur_data_pos = data_pos;
+                                        float curData;
+                                        float pred2D;
+                                        double itvNum, diff;
+                                        size_t index = 0;
+                                        for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+                                                        diff = curData - pred2D;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity_sz){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                *cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                        type[index] = 0;
+                                                                        *cur_pb_pos = curData;
+                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                *cur_pb_pos = curData;
+                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                        }
+                                                        index ++;
+                                                        cur_pb_pos ++;
+                                                        cur_data_pos ++;
+                                                }
+                                                cur_pb_pos += strip_dim0_offset - current_blockcount_y;
+                                                cur_data_pos += dim0_offset - current_blockcount_y;
+                                        }
+                                        /*dealing with the last ii (boundary)*/
+                                        {
+                                                // ii == current_blockcount_x - 1
+                                                for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                        curData = *cur_data_pos;
+                                                        pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
+                                                        diff = curData - pred2D;
+                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                        if (itvNum < intvCapacity_sz){
+                                                                if (diff < 0) itvNum = -itvNum;
+                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                *cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                        type[index] = 0;
+                                                                        *cur_pb_pos = curData;
+                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                }
+                                                        }
+                                                        else{
+                                                                type[index] = 0;
+                                                                *cur_pb_pos = curData;
+                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                        }
+                                                        next_pb_pos[jj] = *cur_pb_pos;
+                                                        index ++;
+                                                        cur_pb_pos ++;
+                                                        cur_data_pos ++;
+                                                }
+                                        }
+                                        total_unpred += unpredictable_count;
+                                        unpredictable_data += unpredictable_count;
+                                        // change indicator
+                                        indicator_pos[j] = 1;
+                                }// end SZ
+                                reg_params_pos ++;
+                                data_pos += current_blockcount_y;
+                                pb_pos += current_blockcount_y;
+                                next_pb_pos += current_blockcount_y;
+                                type += current_blockcount_x * current_blockcount_y;
+                        }// end j
+                        indicator_pos += num_y;
+                        float * tmp;
+                        tmp = cur_pb_buf;
+                        cur_pb_buf = next_pb_buf;
+                        next_pb_buf = tmp;
+                }// end i
+        }
+        free(prediction_buffer_1);
+        free(prediction_buffer_2);
+        int stateNum = 2*quantization_intervals;
+        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+        size_t nodeCount = 0;
+        size_t i = 0;
+        init(huffmanTree, result_type, num_elements);
+        for (i = 0; i < stateNum; i++)
+                if (huffmanTree->code[i]) nodeCount++;
+        nodeCount = nodeCount*2-1;
+        unsigned char *treeBytes;
+        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+        unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+        // total size                                                                           metadata                  # elements   real precision           intervals       nodeCount               huffman                 block index                                             unpredicatable count                                            mean                                            unpred size                             elements
+        unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+        unsigned char * result_pos = result;
+        initRandomAccessBytes(result_pos);
+        result_pos += meta_data_offset;
+        sizeToBytes(result_pos, num_elements);
+        result_pos += exe_params->SZ_SIZE_TYPE;
+        intToBytes_bigEndian(result_pos, block_size);
+        result_pos += sizeof(int);
+        doubleToBytes(result_pos, realPrecision);
+        result_pos += sizeof(double);
+        intToBytes_bigEndian(result_pos, quantization_intervals);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, treeByteSize);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, nodeCount);
+        result_pos += sizeof(int);
+        memcpy(result_pos, treeBytes, treeByteSize);
+        result_pos += treeByteSize;
+        free(treeBytes);
+        memcpy(result_pos, &use_mean, sizeof(unsigned char));
+        result_pos += sizeof(unsigned char);
+        memcpy(result_pos, &mean, sizeof(float));
+        result_pos += sizeof(float);
+        size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+        result_pos += indicator_size;
+        //convert the lead/mid/resi to byte stream
+        if(reg_count>0){
+                for(int e=0; e<3; e++){
+                        int stateNum = 2*coeff_intvCapacity_sz;
+                        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+                        size_t nodeCount = 0;
+                        init(huffmanTree, coeff_type[e], reg_count);
+                        size_t i = 0;
+                        for (i = 0; i < huffmanTree->stateNum; i++)
+                                if (huffmanTree->code[i]) nodeCount++;
+                        nodeCount = nodeCount*2-1;
+                        unsigned char *treeBytes;
+                        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+                        doubleToBytes(result_pos, precision[e]);
+                        result_pos += sizeof(double);
+                        intToBytes_bigEndian(result_pos, coeff_intvRadius);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, treeByteSize);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, nodeCount);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, treeBytes, treeByteSize);
+                        result_pos += treeByteSize;
+                        free(treeBytes);
+                        size_t typeArray_size = 0;
+                        encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+                        sizeToBytes(result_pos, typeArray_size);
+                        result_pos += sizeof(size_t) + typeArray_size;
+                        intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+                        result_pos += coeff_unpredictable_count[e]*sizeof(float);
+                        SZ_ReleaseHuffman(huffmanTree);
+                }
+        }
+        free(coeff_result_type);
+        free(coeff_unpredictable_data);
+        //record the number of unpredictable data and also store them
+        memcpy(result_pos, &total_unpred, sizeof(size_t));
+        result_pos += sizeof(size_t);
+        memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+        result_pos += total_unpred * sizeof(float);
+        size_t typeArray_size = 0;
+        encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+        result_pos += typeArray_size;
+        size_t totalEncodeSize = result_pos - result;
+        free(indicator);
+        free(result_unpredictable_data);
+        free(result_type);
+        free(reg_params);
+        SZ_ReleaseHuffman(huffmanTree);
+        *comp_size = totalEncodeSize;
+        return result;
+}
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq)
+{       /*
+         * Samples a 3D float array and derives (a) a quantization interval count
+         * from the distribution of prediction errors and (b) statistics describing
+         * how densely the sampled values cluster near their sampled mean.
+         *
+         * Parameters:
+         *   oriData        input array of r1*r2*r3 floats; only read here.
+         *   r1, r2, r3     dimension sizes; the neighbour offsets used below
+         *                  (-1, -r3, -r2*r3) treat r3 as the contiguous dimension.
+         *   realPrecision  error bound; used as the bin width of both histograms.
+         *   dense_pos      out: mean + offset of the centre of the most populated
+         *                  pair of adjacent value-histogram bins.
+         *   max_freq       out: fraction of samples whose prediction error is
+         *                  below realPrecision.
+         *   mean_freq      out: fraction of samples counted in that densest bin pair.
+         *
+         * Returns the value produced by roundUpToPowerOf2() for the chosen interval
+         * count, raised to 32 if it is smaller.
+         *
+         * Sampling distance, prediction threshold and the error-histogram size are
+         * read from the global confparams_cpr.
+         *
+         * NOTE(review): neither malloc() result is checked for NULL before use.
+         * NOTE(review): if the sampling loop never runs, sample_count stays 0 and
+         * both output frequencies are computed as 0.0 / 0 - confirm callers cope.
+         */
+        float mean = 0.0;
+        size_t len = r1 * r2 * r3;
+        size_t mean_distance = (int) (sqrt(len)); // stride of the coarse mean-estimation pass
+        float * data_pos = oriData;
+        size_t offset_count = 0;
+        size_t offset_count_2 = 0;
+        size_t mean_count = 0;
+        while(data_pos - oriData < len){ // pass 1: accumulate every mean_distance-th element into mean
+                mean += *data_pos;
+                mean_count ++;
+                data_pos += mean_distance;
+                offset_count += mean_distance;
+                offset_count_2 += mean_distance;
+                if(offset_count >= r3){ // accumulated stride passed one row length: step back one element (presumably to stagger the sampled columns - TODO confirm)
+                        offset_count = 0;
+                        data_pos -= 1;
+                }
+                if(offset_count_2 >= r2 * r3){ // same adjustment each time the accumulated stride passes one plane (r2*r3)
+                        offset_count_2 = 0;
+                        data_pos -= 1;
+                }
+        }
+        if(mean_count > 0) mean /= mean_count; // mean stays 0.0 when nothing was sampled (len == 0)
+        size_t range = 8192; // number of bins in the value histogram (freq_intervals)
+        size_t radius = 4096; // bin index that corresponds to the sampled mean
+        size_t * freq_intervals = (size_t *) malloc(range*sizeof(size_t));
+        memset(freq_intervals, 0, range*sizeof(size_t));
+        unsigned int maxRangeRadius = confparams_cpr->maxRangeRadius;
+        int sampleDistance = confparams_cpr->sampleDistance;
+        float predThreshold = confparams_cpr->predThreshold;
+        size_t i;
+        size_t radiusIndex;
+        size_t r23=r2*r3;
+        float pred_value = 0, pred_err;
+        size_t *intervals = (size_t*)malloc(maxRangeRadius*sizeof(size_t)); // histogram of quantized prediction errors
+        memset(intervals, 0, maxRangeRadius*sizeof(size_t));
+        float mean_diff;
+        ptrdiff_t freq_index;
+        size_t freq_count = 0;
+        size_t sample_count = 0;
+        offset_count = confparams_cpr->sampleDistance - 2; // column (offset along r3) of the first sample; NOTE(review): for sampleDistance < 3 the neighbour reads below would start before oriData - confirm the configuration guarantees a larger value
+        data_pos = oriData + r23 + r3 + offset_count; // first sample sits in plane 1, row 1 so that the previous-plane and previous-row neighbours exist
+        size_t n1_count = 1, n2_count = 1; // current plane (n1) and row (n2) counters, both starting at 1
+        while(data_pos - oriData < len){ // pass 2: visit the sampled points and fill both histograms
+                pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1]; // predict from the 7 preceding neighbours of the surrounding 2x2x2 cell
+                pred_err = fabs(pred_value - *data_pos);
+                if(pred_err < realPrecision) freq_count ++; // prediction already within the error bound
+                radiusIndex = (pred_err/realPrecision+1)/2; // error expressed in units of 2*realPrecision, truncated by the conversion to size_t
+                if(radiusIndex>=maxRangeRadius)
+                {
+                        radiusIndex = maxRangeRadius - 1; // clamp large errors into the last bin
+                }
+                intervals[radiusIndex]++;
+                mean_diff = *data_pos - mean;
+                if(mean_diff > 0) freq_index = (ptrdiff_t)(mean_diff/realPrecision) + radius; // bin width is realPrecision; the cast truncates toward zero
+                else freq_index = (ptrdiff_t)(mean_diff/realPrecision) - 1 + radius; // the extra -1 places non-positive differences in the bins below radius
+                if(freq_index <= 0){ // underflow: count in the first bin
+                        freq_intervals[0] ++;
+                }
+                else if(freq_index >= range){ // overflow: count in the last bin
+                        freq_intervals[range - 1] ++;
+                }
+                else{
+                        freq_intervals[freq_index] ++;
+                }
+                offset_count += sampleDistance; // column the next in-row sample would have
+                if(offset_count >= r3){ // next sample would leave the row: move on to a following row
+                        n2_count ++;
+                        if(n2_count == r2){ // row counter reached r2: advance the plane counter, restart rows at 1 and add one extra row of offset (presumably to skip row 0, which has no previous row in its plane - TODO confirm)
+                                n1_count ++;
+                                n2_count = 1;
+                                data_pos += r3;
+                        }
+                        offset_count_2 = (n1_count + n2_count) % sampleDistance; // stagger that depends on the plane and row counters
+                        data_pos += (r3 + sampleDistance - offset_count) + (sampleDistance - offset_count_2); // jump to the next row, landing at column (sampleDistance - offset_count_2)
+                        offset_count = (sampleDistance - offset_count_2); // keep offset_count equal to the new column
+                        if(offset_count == 0) offset_count ++;
+                }
+                else data_pos += sampleDistance; // stay in the row, advance by one sampling step
+                sample_count ++;
+        }
+        *max_freq = freq_count * 1.0/ sample_count; // fraction of samples predicted within realPrecision
+        //compute the appropriate number
+        size_t targetCount = sample_count*predThreshold; // number of samples the chosen intervals must cover (product truncated to size_t)
+        size_t sum = 0;
+        for(i=0;i<maxRangeRadius;i++) // smallest i whose cumulative error count exceeds targetCount
+        {
+                sum += intervals[i];
+                if(sum>targetCount)
+                        break;
+        }
+        if(i>=maxRangeRadius) // threshold never exceeded: fall back to the last bin
+                i = maxRangeRadius-1;
+        unsigned int accIntervals = 2*(i+1); // twice the number of error bins selected above
+        unsigned int powerOf2 = roundUpToPowerOf2(accIntervals); // helper defined elsewhere; by its name it rounds up to a power of two
+        if(powerOf2<32) // enforce a minimum of 32 intervals
+                powerOf2 = 32;
+        // collect frequency
+        size_t max_sum = 0;
+        size_t max_index = 0;
+        size_t tmp_sum;
+        size_t * freq_pos = freq_intervals + 1; // start at bin 1: the clamping bins 0 and range-1 are never part of a pair
+        for(size_t i=1; i<range-2; i++){ // this i shadows the outer i; find the adjacent bin pair (i, i+1) with the largest combined count
+                tmp_sum = freq_pos[0] + freq_pos[1];
+                if(tmp_sum > max_sum){
+                        max_sum = tmp_sum;
+                        max_index = i;
+                }
+                freq_pos ++;
+        }
+        *dense_pos = mean + realPrecision * (ptrdiff_t)(max_index + 1 - radius); // boundary shared by bins max_index and max_index+1, i.e. the centre of the densest pair, as an offset from mean
+        *mean_freq = max_sum * 1.0 / sample_count; // fraction of samples counted in the densest pair
+        free(freq_intervals);
+        free(intervals);
+        return powerOf2;
+}
+// 3D:  modified for higher performance
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+#ifdef HAVE_TIMECMPR
+        float* decData = NULL;
+        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                decData = (float*)(multisteps->hist_data);
+#endif
+        unsigned int quantization_intervals;
+        float sz_sample_correct_freq = -1;//0.5; //-1
+        float dense_pos;
+        float mean_flush_freq;
+        unsigned char use_mean = 0;
+        // calculate block dims
+        size_t num_x, num_y, num_z;
+        size_t block_size = 6;
+        SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r1, num_x, block_size);
+        SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r2, num_y, block_size);
+        SZ_COMPUTE_3D_NUMBER_OF_BLOCKS(r3, num_z, block_size);
+        size_t split_index_x, split_index_y, split_index_z;
+        size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+        size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+        SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+        SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+        SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+        size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+        size_t num_blocks = num_x * num_y * num_z;
+        size_t num_elements = r1 * r2 * r3;
+        size_t dim0_offset = r2 * r3;
+        size_t dim1_offset = r3;
+        int * result_type = (int *) malloc(num_elements * sizeof(int));
+        size_t unpred_data_max_size = max_num_block_elements;
+        float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+        size_t total_unpred = 0;
+        size_t unpredictable_count;
+        size_t max_unpred_count = 0;
+        float * data_pos = oriData;
+        int * type = result_type;
+        size_t type_offset;
+        size_t offset_x, offset_y, offset_z;
+        size_t current_blockcount_x, current_blockcount_y, current_blockcount_z;
+        float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+        float * reg_params_pos = reg_params;
+        // move regression part out
+        size_t params_offset_b = num_blocks;
+        size_t params_offset_c = 2*num_blocks;
+        size_t params_offset_d = 3*num_blocks;
+        for(size_t i=0; i<num_x; i++){
+                for(size_t j=0; j<num_y; j++){
+                        for(size_t k=0; k<num_z; k++){
+                                current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                                current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                                current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+                                offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                                offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                                offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+                                data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+                                /*Calculate regression coefficients*/
+                                {
+                                        float * cur_data_pos = data_pos;
+                                        float fx = 0.0;
+                                        float fy = 0.0;
+                                        float fz = 0.0;
+                                        float f = 0;
+                                        float sum_x, sum_y;
+                                        float curData;
+                                        for(size_t i=0; i<current_blockcount_x; i++){
+                                                sum_x = 0;
+                                                for(size_t j=0; j<current_blockcount_y; j++){
+                                                        sum_y = 0;
+                                                        for(size_t k=0; k<current_blockcount_z; k++){
+                                                                curData = *cur_data_pos;
+                                                                // f += curData;
+                                                                // fx += curData * i;
+                                                                // fy += curData * j;
+                                                                // fz += curData * k;
+                                                                sum_y += curData;
+                                                                fz += curData * k;
+                                                                cur_data_pos ++;
+                                                        }
+                                                        fy += sum_y * j;
+                                                        sum_x += sum_y;
+                                                        cur_data_pos += dim1_offset - current_blockcount_z;
+                                                }
+                                                fx += sum_x * i;
+                                                f += sum_x;
+                                                cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+                                        }
+                                        float coeff = 1.0 / (current_blockcount_x * current_blockcount_y * current_blockcount_z);
+                                        reg_params_pos[0] = (2 * fx / (current_blockcount_x - 1) - f) * 6 * coeff / (current_blockcount_x + 1);
+                                        reg_params_pos[params_offset_b] = (2 * fy / (current_blockcount_y - 1) - f) * 6 * coeff / (current_blockcount_y + 1);
+                                        reg_params_pos[params_offset_c] = (2 * fz / (current_blockcount_z - 1) - f) * 6 * coeff / (current_blockcount_z + 1);
+                                        reg_params_pos[params_offset_d] = f * coeff - ((current_blockcount_x - 1) * reg_params_pos[0] / 2 + (current_blockcount_y - 1) * reg_params_pos[params_offset_b] / 2 + (current_blockcount_z - 1) * reg_params_pos[params_offset_c] / 2);
+                                }
+                                reg_params_pos ++;
+                        }
+                }
+        }
+        //Compress coefficient arrays
+        double precision_a, precision_b, precision_c, precision_d;
+        float rel_param_err = 0.025;
+        precision_a = rel_param_err * realPrecision / late_blockcount_x;
+        precision_b = rel_param_err * realPrecision / late_blockcount_y;
+        precision_c = rel_param_err * realPrecision / late_blockcount_z;
+        precision_d = rel_param_err * realPrecision;
+        if(exe_params->optQuantMode==1)
+        {
+                quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+                if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+                updateQuantizationInfo(quantization_intervals);
+        }
+        else{
+                quantization_intervals = exe_params->intvCapacity;
+        }
+        float mean = 0;
+        if(use_mean){
+                // compute mean
+                double sum = 0.0;
+                size_t mean_count = 0;
+                for(size_t i=0; i<num_elements; i++){
+                        if(fabs(oriData[i] - dense_pos) < realPrecision){
+                                sum += oriData[i];
+                                mean_count ++;
+                        }
+                }
+                if(mean_count > 0) mean = sum / mean_count;
+        }
+        double tmp_realPrecision = realPrecision;
+        // use two prediction buffers for higher performance
+        float * unpredictable_data = result_unpredictable_data;
+        unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+        memset(indicator, 0, num_blocks * sizeof(unsigned char));
+        size_t reg_count = 0;
+        size_t strip_dim_0 = early_blockcount_x + 1;
+        size_t strip_dim_1 = r2 + 1;
+        size_t strip_dim_2 = r3 + 1;
+        size_t strip_dim0_offset = strip_dim_1 * strip_dim_2;
+        size_t strip_dim1_offset = strip_dim_2;
+        unsigned char * indicator_pos = indicator;
+        size_t prediction_buffer_size = strip_dim_0 * strip_dim0_offset * sizeof(float);
+        float * prediction_buffer_1 = (float *) malloc(prediction_buffer_size);
+        memset(prediction_buffer_1, 0, prediction_buffer_size);
+        float * prediction_buffer_2 = (float *) malloc(prediction_buffer_size);
+        memset(prediction_buffer_2, 0, prediction_buffer_size);
+        float * cur_pb_buf = prediction_buffer_1;
+        float * next_pb_buf = prediction_buffer_2;
+        float * cur_pb_buf_pos;
+        float * next_pb_buf_pos;
+        int intvCapacity = exe_params->intvCapacity;
+        int intvRadius = exe_params->intvRadius;
+        int use_reg = 0;
+        float noise = realPrecision * 1.22;
+        reg_params_pos = reg_params;
+        // compress the regression coefficients on the fly
+        float last_coeffcients[4] = {0.0};
+        int coeff_intvCapacity_sz = 65536;
+        int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+        int * coeff_type[4];
+        int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+        float * coeff_unpred_data[4];
+        float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+        double precision[4];
+        precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+        for(int i=0; i<4; i++){
+                coeff_type[i] = coeff_result_type + i * num_blocks;
+                coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+        }
+        int coeff_index = 0;
+        unsigned int coeff_unpredictable_count[4] = {0};
+        if(use_mean){
+                int intvCapacity_sz = intvCapacity - 2;
+                for(size_t i=0; i<num_x; i++){
+                        current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                        offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                        for(size_t j=0; j<num_y; j++){
+                                offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                                current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                                data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+                                type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+                                type = result_type + type_offset;
+                                // prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+                                cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+                                next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+                                size_t current_blockcount_z;
+                                float * pb_pos = cur_pb_buf_pos;
+                                float * next_pb_pos = next_pb_buf_pos;
+                                size_t strip_unpredictable_count = 0;
+                                for(size_t k=0; k<num_z; k++){
+                                        current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+                                        size_t offset_z = 0;
+                                        offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+                                        size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif
+                                        /*sampling and decide which predictor*/
+                                        {
+                                                // sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+                                                float * cur_data_pos;
+                                                float curData;
+                                                float pred_reg, pred_sz;
+                                                float err_sz = 0.0, err_reg = 0.0;
+                                                int bmi = 0;
+                                                if(i>0 && j>0 && k>0){
+                                                        for(int i=0; i<block_size; i++){
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                bmi = block_size - i;
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                        }
+                                                }
+                                                else{
+                                                        for(int i=1; i<block_size; i++){
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                bmi = block_size - i;
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                                err_reg += fabs(pred_reg - curData);
+                                                        }
+                                                }
+                                                use_reg = (err_reg < err_sz);
+                                        }
+                                        if(use_reg){
+                                                {
+                                                        /*predict coefficients in current block via previous reg_block*/
+                                                        float cur_coeff;
+                                                        double diff, itvNum;
+                                                        for(int e=0; e<4; e++){
+                                                                cur_coeff = reg_params_pos[e*num_blocks];
+                                                                diff = cur_coeff - last_coeffcients[e];
+                                                                itvNum = fabs(diff)/precision[e] + 1;
+                                                                if (itvNum < coeff_intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                        last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                        //guarantee compression error bound against the case of machine-epsilon
+                                                                        if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                                coeff_type[e][coeff_index] = 0;
+                                                                                last_coeffcients[e] = cur_coeff;
+                                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        coeff_index ++;
+                                                }
+                                                float curData;
+                                                float pred;
+                                                double itvNum;
+                                                double diff;
+                                                size_t index = 0;
+                                                size_t block_unpredictable_count = 0;
+                                                float * cur_data_pos = data_pos;
+                                                for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = pred;
+#endif
+                                                                        if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+                                                                                // assign value to block surfaces
+                                                                                pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                        cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+                                                }
+                                                /*dealing with the last ii (boundary)*/
+                                                {
+                                                        // ii == current_blockcount_x - 1
+                                                        size_t ii = current_blockcount_x - 1;
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee compression error bound against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = pred;
+#endif
+                                                                        if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+                                                                                // assign value to block surfaces
+                                                                                pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+                                                                        }
+                                                                        // assign value to next prediction buffer
+                                                                        next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                }
+                                                unpredictable_count = block_unpredictable_count;
+                                                strip_unpredictable_count += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                reg_count ++;
+                                        }
+                                        else{
+                                                // use SZ
+                                                // SZ prediction
+                                                unpredictable_count = 0;
+                                                float * cur_pb_pos = pb_pos;
+                                                float * cur_data_pos = data_pos;
+                                                float curData;
+                                                float pred3D;
+                                                double itvNum, diff;
+                                                size_t index = 0;
+                                                for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        if(fabs(curData - mean) <= realPrecision){
+                                                                                // adjust type[index] to intvRadius for coherence with freq in reg
+                                                                                type[index] = intvRadius;
+                                                                                *cur_pb_pos = mean;
+                                                                        }
+                                                                        else
+                                                                        {
+                                                                                pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+                                                                                                 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                                diff = curData - pred3D;
+                                                                                itvNum = fabs(diff)/realPrecision + 1;
+                                                                                if (itvNum < intvCapacity_sz){
+                                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                                        type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                        *cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                        if(type[index] <= intvRadius) type[index] -= 1;
+                                                                                        //guarantee compression error bound against the case of machine-epsilon
+                                                                                        if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                                type[index] = 0;
+                                                                                                *cur_pb_pos = curData;
+                                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                                        }
+                                                                                }
+                                                                                else{
+                                                                                        type[index] = 0;
+                                                                                        *cur_pb_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = *cur_pb_pos;
+#endif
+                                                                        index ++;
+                                                                        cur_pb_pos ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                        cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+                                                        cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+                                                }
+                                                /*dealing with the last ii (boundary)*/
+                                                {
+                                                        // ii == current_blockcount_x - 1
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        if(fabs(curData - mean) <= realPrecision){
+                                                                                // adjust type[index] to intvRadius for coherence with freq in reg
+                                                                                type[index] = intvRadius;
+                                                                                *cur_pb_pos = mean;
+                                                                        }
+                                                                        else
+                                                                        {
+                                                                                pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+                                                                                                 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                                diff = curData - pred3D;
+                                                                                itvNum = fabs(diff)/realPrecision + 1;
+                                                                                if (itvNum < intvCapacity_sz){
+                                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                                        type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                        *cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                        if(type[index] <= intvRadius) type[index] -= 1;
+                                                                                        //guarantee the compression error bound against the case of machine-epsilon
+                                                                                        if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                                type[index] = 0;
+                                                                                                *cur_pb_pos = curData;
+                                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                                        }
+                                                                                }
+                                                                                else{
+                                                                                        type[index] = 0;
+                                                                                        *cur_pb_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t ii = current_blockcount_x - 1;
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = *cur_pb_pos;
+#endif
+                                                                        next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+                                                                        index ++;
+                                                                        cur_pb_pos ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                }
+                                                strip_unpredictable_count += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                // change indicator
+                                                indicator_pos[k] = 1;
+                                        }// end SZ
+                                        reg_params_pos ++;
+                                        data_pos += current_blockcount_z;
+                                        pb_pos += current_blockcount_z;
+                                        next_pb_pos += current_blockcount_z;
+                                        type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+                                } // end k
+                                if(strip_unpredictable_count > max_unpred_count){
+                                        max_unpred_count = strip_unpredictable_count;
+                                }
+                                total_unpred += strip_unpredictable_count;
+                                indicator_pos += num_z;
+                        }// end j
+                        float * tmp;
+                        tmp = cur_pb_buf;
+                        cur_pb_buf = next_pb_buf;
+                        next_pb_buf = tmp;
+                }// end i
+        }
+        else{
+                int intvCapacity_sz = intvCapacity - 2;
+                for(size_t i=0; i<num_x; i++){
+                        current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+                        offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+                        for(size_t j=0; j<num_y; j++){
+                                offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+                                current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+                                data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset;
+                                // copy bottom plane from plane buffer
+                                // memcpy(prediction_buffer, bottom_buffer + offset_y * strip_dim1_offset, (current_blockcount_y + 1) * strip_dim1_offset * sizeof(float));
+                                type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset;
+                                type = result_type + type_offset;
+                                // prediction buffer is (current_block_count_x + 1) * (current_block_count_y + 1) * (current_block_count_z + 1)
+                                cur_pb_buf_pos = cur_pb_buf + offset_y * strip_dim1_offset + strip_dim0_offset + strip_dim1_offset + 1;
+                                next_pb_buf_pos = next_pb_buf + offset_y * strip_dim1_offset + strip_dim1_offset + 1;
+                                size_t current_blockcount_z;
+                                float * pb_pos = cur_pb_buf_pos;
+                                float * next_pb_pos = next_pb_buf_pos;
+                                size_t strip_unpredictable_count = 0;
+                                for(size_t k=0; k<num_z; k++){
+                                        current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+#ifdef HAVE_TIMECMPR
+                                size_t offset_z = 0;
+                                offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+                                size_t block_offset = offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+#endif
+                                        /*sampling*/
+                                        {
+                                                // sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+                                                float * cur_data_pos;
+                                                float curData;
+                                                float pred_reg, pred_sz;
+                                                float err_sz = 0.0, err_reg = 0.0;
+                                                int bmi;
+                                                if(i>0 && j>0 && k>0){
+                                                        for(int i=0; i<block_size; i++){
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                bmi = block_size - i;
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                        }
+                                                }
+                                                else{
+                                                        for(int i=1; i<block_size; i++){
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                bmi = block_size - i;
+                                                                cur_data_pos = data_pos + i*dim0_offset + i*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + i;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                                cur_data_pos = data_pos + i*dim0_offset + bmi*dim1_offset + bmi;
+                                                                curData = *cur_data_pos;
+                                                                pred_sz = cur_data_pos[-1] + cur_data_pos[-dim1_offset]+ cur_data_pos[-dim0_offset] - cur_data_pos[-dim1_offset - 1] - cur_data_pos[-dim0_offset - 1] - cur_data_pos[-dim0_offset - dim1_offset] + cur_data_pos[-dim0_offset - dim1_offset - 1];
+                                                                pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                                err_sz += fabs(pred_sz - curData) + noise;
+                                                                err_reg += fabs(pred_reg - curData);
+                                                        }
+                                                }
+                                                use_reg = (err_reg < err_sz);
+                                        }
+                                        if(use_reg)
+                                        {
+                                                {
+                                                        /*predict coefficients in current block via previous reg_block*/
+                                                        float cur_coeff;
+                                                        double diff, itvNum;
+                                                        for(int e=0; e<4; e++){
+                                                                cur_coeff = reg_params_pos[e*num_blocks];
+                                                                diff = cur_coeff - last_coeffcients[e];
+                                                                itvNum = fabs(diff)/precision[e] + 1;
+                                                                if (itvNum < coeff_intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                        last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                        //guarantee the compression error bound against the case of machine-epsilon
+                                                                        if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                                coeff_type[e][coeff_index] = 0;
+                                                                                last_coeffcients[e] = cur_coeff;
+                                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        coeff_index ++;
+                                                }
+                                                float curData;
+                                                float pred;
+                                                double itvNum;
+                                                double diff;
+                                                size_t index = 0;
+                                                size_t block_unpredictable_count = 0;
+                                                float * cur_data_pos = data_pos;
+                                                for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee the compression error bound against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = pred;
+#endif
+                                                                        if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+                                                                                // assign value to block surfaces
+                                                                                pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                        cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+                                                }
+                                                /*dealing with the last ii (boundary)*/
+                                                {
+                                                        // ii == current_blockcount_x - 1
+                                                        size_t ii = current_blockcount_x - 1;
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee the compression error bound against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = pred;
+#endif
+                                                                        if((jj == current_blockcount_y - 1) || (kk == current_blockcount_z - 1)){
+                                                                                // assign value to block surfaces
+                                                                                pb_pos[ii * strip_dim0_offset + jj * strip_dim1_offset + kk] = pred;
+                                                                        }
+                                                                        // assign value to next prediction buffer
+                                                                        next_pb_pos[jj * strip_dim1_offset + kk] = pred;
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                }
+                                                unpredictable_count = block_unpredictable_count;
+                                                strip_unpredictable_count += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                reg_count ++;
+                                        }
+                                        else{
+                                                // use SZ
+                                                // SZ prediction
+                                                unpredictable_count = 0;
+                                                float * cur_pb_pos = pb_pos;
+                                                float * cur_data_pos = data_pos;
+                                                float curData;
+                                                float pred3D;
+                                                double itvNum, diff;
+                                                size_t index = 0;
+                                                for(size_t ii=0; ii<current_blockcount_x - 1; ii++){
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+                                                                                         - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                        diff = curData - pred3D;
+                                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                                        if (itvNum < intvCapacity_sz){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                *cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee the compression error bound against the case of machine-epsilon
+                                                                                if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        *cur_pb_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                *cur_pb_pos = curData;
+                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = *cur_pb_pos;
+#endif
+                                                                        index ++;
+                                                                        cur_pb_pos ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                        cur_pb_pos += strip_dim0_offset - current_blockcount_y * strip_dim1_offset;
+                                                        cur_data_pos += dim0_offset - current_blockcount_y * dim1_offset;
+                                                }
+                                                /*dealing with the last ii (boundary)*/
+                                                {
+                                                        // ii == current_blockcount_x - 1
+                                                        for(size_t jj=0; jj<current_blockcount_y; jj++){
+                                                                for(size_t kk=0; kk<current_blockcount_z; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
+                                                                                         - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                        diff = curData - pred3D;
+                                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                                        if (itvNum < intvCapacity_sz){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                *cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //guarantee the compression error bound against machine-epsilon rounding
+                                                                                if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        *cur_pb_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                *cur_pb_pos = curData;
+                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                        }
+#ifdef HAVE_TIMECMPR
+                                                                        size_t ii = current_blockcount_x - 1;
+                                                                        size_t point_offset = ii*dim0_offset + jj*dim1_offset + kk;
+                                                                        if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+                                                                                decData[block_offset + point_offset] = *cur_pb_pos;
+#endif
+                                                                        // assign value to next prediction buffer
+                                                                        next_pb_pos[jj * strip_dim1_offset + kk] = *cur_pb_pos;
+                                                                        index ++;
+                                                                        cur_pb_pos ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_pb_pos += strip_dim1_offset - current_blockcount_z;
+                                                                cur_data_pos += dim1_offset - current_blockcount_z;
+                                                        }
+                                                }
+                                                strip_unpredictable_count += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                // change indicator
+                                                indicator_pos[k] = 1;
+                                        }// end SZ
+                                        reg_params_pos ++;
+                                        data_pos += current_blockcount_z;
+                                        pb_pos += current_blockcount_z;
+                                        next_pb_pos += current_blockcount_z;
+                                        type += current_blockcount_x * current_blockcount_y * current_blockcount_z;
+                                }
+                                if(strip_unpredictable_count > max_unpred_count){
+                                        max_unpred_count = strip_unpredictable_count;
+                                }
+                                total_unpred += strip_unpredictable_count;
+                                indicator_pos += num_z;
+                        }
+                        float * tmp;
+                        tmp = cur_pb_buf;
+                        cur_pb_buf = next_pb_buf;
+                        next_pb_buf = tmp;
+                }
+        }
+        free(prediction_buffer_1);
+        free(prediction_buffer_2);
+        int stateNum = 2*quantization_intervals;
+        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+        size_t nodeCount = 0;
+        init(huffmanTree, result_type, num_elements);
+        size_t i = 0;
+        for (i = 0; i < huffmanTree->stateNum; i++)
+                if (huffmanTree->code[i]) nodeCount++;
+        nodeCount = nodeCount*2-1;
+        unsigned char *treeBytes;
+        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+        unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+        // total size                                                                           metadata                  # elements     real precision         intervals       nodeCount               huffman                 block index                                             unpredicatable count                                            mean                                            unpred size                             elements
+        unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+        unsigned char * result_pos = result;
+        initRandomAccessBytes(result_pos);
+        result_pos += meta_data_offset;
+        sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+        result_pos += exe_params->SZ_SIZE_TYPE;
+        intToBytes_bigEndian(result_pos, block_size);
+        result_pos += sizeof(int);
+        doubleToBytes(result_pos, realPrecision);
+        result_pos += sizeof(double);
+        intToBytes_bigEndian(result_pos, quantization_intervals);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, treeByteSize);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, nodeCount);
+        result_pos += sizeof(int);
+        memcpy(result_pos, treeBytes, treeByteSize);
+        result_pos += treeByteSize;
+        free(treeBytes);
+        memcpy(result_pos, &use_mean, sizeof(unsigned char));
+        result_pos += sizeof(unsigned char);
+        memcpy(result_pos, &mean, sizeof(float));
+        result_pos += sizeof(float);
+        size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+        result_pos += indicator_size;
+        //convert the lead/mid/resi to byte stream
+        if(reg_count > 0){
+                for(int e=0; e<4; e++){
+                        int stateNum = 2*coeff_intvCapacity_sz;
+                        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+                        size_t nodeCount = 0;
+                        init(huffmanTree, coeff_type[e], reg_count);
+                        size_t i = 0;
+                        for (i = 0; i < huffmanTree->stateNum; i++)
+                                if (huffmanTree->code[i]) nodeCount++;
+                        nodeCount = nodeCount*2-1;
+                        unsigned char *treeBytes;
+                        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+                        doubleToBytes(result_pos, precision[e]);
+                        result_pos += sizeof(double);
+                        intToBytes_bigEndian(result_pos, coeff_intvRadius);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, treeByteSize);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, nodeCount);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, treeBytes, treeByteSize);
+                        result_pos += treeByteSize;
+                        free(treeBytes);
+                        size_t typeArray_size = 0;
+                        encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+                        sizeToBytes(result_pos, typeArray_size);
+                        result_pos += sizeof(size_t) + typeArray_size;
+                        intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+                        result_pos += coeff_unpredictable_count[e]*sizeof(float);
+                        SZ_ReleaseHuffman(huffmanTree);
+                }
+        }
+        free(coeff_result_type);
+        free(coeff_unpredictable_data);
+        //record the number of unpredictable data and also store them
+        memcpy(result_pos, &total_unpred, sizeof(size_t));
+        result_pos += sizeof(size_t);
+        memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+        result_pos += total_unpred * sizeof(float);
+        size_t typeArray_size = 0;
+        encode(huffmanTree, result_type, num_elements, result_pos, &typeArray_size);
+        result_pos += typeArray_size;
+        size_t totalEncodeSize = result_pos - result;
+        free(indicator);
+        free(result_unpredictable_data);
+        free(result_type);
+        free(reg_params);
+        SZ_ReleaseHuffman(huffmanTree);
+        *comp_size = totalEncodeSize;
+        return result;
+}
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+        unsigned int quantization_intervals;
+        float sz_sample_correct_freq = -1;//0.5; //-1
+        float dense_pos;
+        float mean_flush_freq;
+        unsigned char use_mean = 0;
+        // calculate block dims
+        size_t num_x, num_y, num_z;
+        size_t block_size = 6;
+        num_x = (r1 - 1) / block_size + 1;
+        num_y = (r2 - 1) / block_size + 1;
+        num_z = (r3 - 1) / block_size + 1;
+        size_t max_num_block_elements = block_size * block_size * block_size;
+        size_t num_blocks = num_x * num_y * num_z;
+        size_t num_elements = r1 * r2 * r3;
+        size_t dim0_offset = r2 * r3;
+        size_t dim1_offset = r3;
+        int * result_type = (int *) malloc(num_blocks*max_num_block_elements * sizeof(int));
+        size_t unpred_data_max_size = max_num_block_elements;
+        float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+        size_t total_unpred = 0;
+        size_t unpredictable_count;
+        float * data_pos = oriData;
+        int * type = result_type;
+        float * reg_params = (float *) malloc(num_blocks * 4 * sizeof(float));
+        float * reg_params_pos = reg_params;
+        // move regression part out
+        size_t params_offset_b = num_blocks;
+        size_t params_offset_c = 2*num_blocks;
+        size_t params_offset_d = 3*num_blocks;
+        float * pred_buffer = (float *) malloc((block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+        float * pred_buffer_pos = NULL;
+        float * block_data_pos_x = NULL;
+        float * block_data_pos_y = NULL;
+        float * block_data_pos_z = NULL;
+        for(size_t i=0; i<num_x; i++){
+                for(size_t j=0; j<num_y; j++){
+                        for(size_t k=0; k<num_z; k++){
+                                data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+                                pred_buffer_pos = pred_buffer;
+                                block_data_pos_x = data_pos;
+                                // use the buffer as block_size*block_size*block_size
+                                for(int ii=0; ii<block_size; ii++){
+                                        block_data_pos_y = block_data_pos_x;
+                                        for(int jj=0; jj<block_size; jj++){
+                                                block_data_pos_z = block_data_pos_y;
+                                                for(int kk=0; kk<block_size; kk++){
+                                                        *pred_buffer_pos = *block_data_pos_z;
+                                                        if(k*block_size + kk + 1 < r3) block_data_pos_z ++;
+                                                        pred_buffer_pos ++;
+                                                }
+                                                if(j*block_size + jj + 1 < r2) block_data_pos_y += dim1_offset;
+                                        }
+                                        if(i*block_size + ii + 1 < r1) block_data_pos_x += dim0_offset;
+                                }
+                                /*Calculate regression coefficients*/
+                                {
+                                        float * cur_data_pos = pred_buffer;
+                                        float fx = 0.0;
+                                        float fy = 0.0;
+                                        float fz = 0.0;
+                                        float f = 0;
+                                        float sum_x, sum_y;
+                                        float curData;
+                                        for(size_t i=0; i<block_size; i++){
+                                                sum_x = 0;
+                                                for(size_t j=0; j<block_size; j++){
+                                                        sum_y = 0;
+                                                        for(size_t k=0; k<block_size; k++){
+                                                                curData = *cur_data_pos;
+                                                                sum_y += curData;
+                                                                fz += curData * k;
+                                                                cur_data_pos ++;
+                                                        }
+                                                        fy += sum_y * j;
+                                                        sum_x += sum_y;
+                                                }
+                                                fx += sum_x * i;
+                                                f += sum_x;
+                                        }
+                                        float coeff = 1.0 / (block_size * block_size * block_size);
+                                        reg_params_pos[0] = (2 * fx / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+                                        reg_params_pos[params_offset_b] = (2 * fy / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+                                        reg_params_pos[params_offset_c] = (2 * fz / (block_size - 1) - f) * 6 * coeff / (block_size + 1);
+                                        reg_params_pos[params_offset_d] = f * coeff - ((block_size - 1) * reg_params_pos[0] / 2 + (block_size - 1) * reg_params_pos[params_offset_b] / 2 + (block_size - 1) * reg_params_pos[params_offset_c] / 2);
+                                }
+                                reg_params_pos ++;
+                        }
+                }
+        }
+        //Compress coefficient arrays
+        double precision_a, precision_b, precision_c, precision_d;
+        float rel_param_err = 0.025;
+        precision_a = rel_param_err * realPrecision / block_size;
+        precision_b = rel_param_err * realPrecision / block_size;
+        precision_c = rel_param_err * realPrecision / block_size;
+        precision_d = rel_param_err * realPrecision;
+        if(exe_params->optQuantMode==1)
+        {
+                quantization_intervals = optimize_intervals_float_3D_with_freq_and_dense_pos(oriData, r1, r2, r3, realPrecision, &dense_pos, &sz_sample_correct_freq, &mean_flush_freq);
+                if(mean_flush_freq > 0.5 || mean_flush_freq > sz_sample_correct_freq) use_mean = 1;
+                updateQuantizationInfo(quantization_intervals);
+        }
+        else{
+                quantization_intervals = exe_params->intvCapacity;
+        }
+        float mean = 0;
+        if(use_mean){
+                // compute mean
+                double sum = 0.0;
+                size_t mean_count = 0;
+                for(size_t i=0; i<num_elements; i++){
+                        if(fabs(oriData[i] - dense_pos) < realPrecision){
+                                sum += oriData[i];
+                                mean_count ++;
+                        }
+                }
+                if(mean_count > 0) mean = sum / mean_count;
+        }
+        double tmp_realPrecision = realPrecision;
+        // use two prediction buffers for higher performance
+        float * unpredictable_data = result_unpredictable_data;
+        unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
+        memset(indicator, 0, num_blocks * sizeof(unsigned char));
+        size_t reg_count = 0;
+        unsigned char * indicator_pos = indicator;
+        int intvCapacity = exe_params->intvCapacity;
+        int intvRadius = exe_params->intvRadius;
+        int use_reg = 0;
+        float noise = realPrecision * 1.22;
+        reg_params_pos = reg_params;
+        // compress the regression coefficients on the fly
+        float last_coeffcients[4] = {0.0};
+        int coeff_intvCapacity_sz = 65536;
+        int coeff_intvRadius = coeff_intvCapacity_sz / 2;
+        int * coeff_type[4];
+        int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
+        float * coeff_unpred_data[4];
+        float * coeff_unpredictable_data = (float *) malloc(num_blocks*4*sizeof(float));
+        double precision[4];
+        precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+        for(int i=0; i<4; i++){
+                coeff_type[i] = coeff_result_type + i * num_blocks;
+                coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
+        }
+        int coeff_index = 0;
+        unsigned int coeff_unpredictable_count[4] = {0};
+        memset(pred_buffer, 0, (block_size+1)*(block_size+1)*(block_size+1)*sizeof(float));
+        int pred_buffer_block_size = block_size + 1;
+        int strip_dim0_offset = pred_buffer_block_size * pred_buffer_block_size;
+        int strip_dim1_offset = pred_buffer_block_size;
+        if(use_mean){
+                int intvCapacity_sz = intvCapacity - 2;
+                type = result_type;
+                for(size_t i=0; i<num_x; i++){
+                        for(size_t j=0; j<num_y; j++){
+                                for(size_t k=0; k<num_z; k++){
+                                        data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+                                        // add 1 in x, y, z offset
+                                        pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                        block_data_pos_x = data_pos;
+                                        for(int ii=0; ii<block_size; ii++){
+                                                block_data_pos_y = block_data_pos_x;
+                                                for(int jj=0; jj<block_size; jj++){
+                                                        block_data_pos_z = block_data_pos_y;
+                                                        for(int kk=0; kk<block_size; kk++){
+                                                                *pred_buffer_pos = *block_data_pos_z;
+                                                                if(k*block_size + kk + 1< r3) block_data_pos_z ++;
+                                                                pred_buffer_pos ++;
+                                                        }
+                                                        // add 1 in z offset
+                                                        pred_buffer_pos ++;
+                                                        if(j*block_size + jj + 1< r2) block_data_pos_y += dim1_offset;
+                                                }
+                                                // add 1 in y offset
+                                                pred_buffer_pos += pred_buffer_block_size;
+                                                if(i*block_size + ii + 1< r1) block_data_pos_x += dim0_offset;
+                                        }
+                                        /*sampling and decide which predictor*/
+                                        {
+                                                // sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+                                                float * cur_data_pos;
+                                                float curData;
+                                                float pred_reg, pred_sz;
+                                                float err_sz = 0.0, err_reg = 0.0;
+                                                int bmi = 0;
+                                                for(int i=2; i<=block_size; i++){
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                        err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        bmi = block_size - i;
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                        err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                        err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                        err_sz += MIN(fabs(pred_sz - curData) + noise, fabs(mean - curData));
+                                                        err_reg += fabs(pred_reg - curData);
+                                                }
+                                                use_reg = (err_reg < err_sz);
+                                        }
+                                        if(use_reg){
+                                                {
+                                                        /*predict coefficients in current block via previous reg_block*/
+                                                        float cur_coeff;
+                                                        double diff, itvNum;
+                                                        for(int e=0; e<4; e++){
+                                                                cur_coeff = reg_params_pos[e*num_blocks];
+                                                                diff = cur_coeff - last_coeffcients[e];
+                                                                itvNum = fabs(diff)/precision[e] + 1;
+                                                                if (itvNum < coeff_intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                        last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                        //guarantee the compression error bound against machine-epsilon rounding
+                                                                        if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                                coeff_type[e][coeff_index] = 0;
+                                                                                last_coeffcients[e] = cur_coeff;
+                                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        coeff_index ++;
+                                                }
+                                                float curData;
+                                                float pred;
+                                                double itvNum;
+                                                double diff;
+                                                size_t index = 0;
+                                                size_t block_unpredictable_count = 0;
+                                                float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                                for(size_t ii=0; ii<block_size; ii++){
+                                                        for(size_t jj=0; jj<block_size; jj++){
+                                                                for(size_t kk=0; kk<block_size; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos ++;
+                                                        }
+                                                        cur_data_pos += pred_buffer_block_size;
+                                                }
+                                                total_unpred += block_unpredictable_count;
+                                                unpredictable_data += block_unpredictable_count;
+                                                reg_count ++;
+                                        }
+                                        else{
+                                                // use SZ
+                                                // SZ predication
+                                                unpredictable_count = 0;
+                                                float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                                float curData;
+                                                float pred3D;
+                                                double itvNum, diff;
+                                                size_t index = 0;
+                                                for(size_t ii=0; ii<block_size; ii++){
+                                                        for(size_t jj=0; jj<block_size; jj++){
+                                                                for(size_t kk=0; kk<block_size; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        if(fabs(curData - mean) <= realPrecision){
+                                                                                type[index] = 1;
+                                                                                *cur_data_pos = mean;
+                                                                        }
+                                                                        else
+                                                                        {
+                                                                                pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+                                                                                                 - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                                diff = curData - pred3D;
+                                                                                itvNum = fabs(diff)/realPrecision + 1;
+                                                                                if (itvNum < intvCapacity_sz){
+                                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                                        type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                        *cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                        //ganrantee comporession error against the case of machine-epsilon
+                                                                                        if(fabs(curData - *cur_data_pos)>tmp_realPrecision){
+                                                                                                type[index] = 0;
+                                                                                                *cur_data_pos = curData;
+                                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                                        }
+                                                                                }
+                                                                                else{
+                                                                                        type[index] = 0;
+                                                                                        *cur_data_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos ++;
+                                                        }
+                                                        cur_data_pos += pred_buffer_block_size;
+                                                }
+                                                total_unpred += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                // change indicator
+                                                indicator_pos[k] = 1;
+                                        }// end SZ
+                                        reg_params_pos ++;
+                                        type += block_size * block_size * block_size;
+                                } // end k
+                                indicator_pos += num_z;
+                        }// end j
+                }// end i
+        }
+        else{
+                int intvCapacity_sz = intvCapacity - 2;
+                type = result_type;
+                for(size_t i=0; i<num_x; i++){
+                        for(size_t j=0; j<num_y; j++){
+                                for(size_t k=0; k<num_z; k++){
+                                        data_pos = oriData + i*block_size * dim0_offset + j*block_size * dim1_offset + k*block_size;
+                                        // add 1 in x, y, z offset
+                                        pred_buffer_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                        block_data_pos_x = data_pos;
+                                        for(int ii=0; ii<block_size; ii++){
+                                                block_data_pos_y = block_data_pos_x;
+                                                for(int jj=0; jj<block_size; jj++){
+                                                        block_data_pos_z = block_data_pos_y;
+                                                        for(int kk=0; kk<block_size; kk++){
+                                                                *pred_buffer_pos = *block_data_pos_z;
+                                                                if(k*block_size + kk < r3) block_data_pos_z ++;
+                                                                pred_buffer_pos ++;
+                                                        }
+                                                        // add 1 in z offset
+                                                        pred_buffer_pos ++;
+                                                        if(j*block_size + jj < r2) block_data_pos_y += dim1_offset;
+                                                }
+                                                // add 1 in y offset
+                                                pred_buffer_pos += pred_buffer_block_size;
+                                                if(i*block_size + ii < r1) block_data_pos_x += dim0_offset;
+                                        }
+                                        /*sampling*/
+                                        {
+                                                // sample point [1, 1, 1] [1, 1, 4] [1, 4, 1] [1, 4, 4] [4, 1, 1] [4, 1, 4] [4, 4, 1] [4, 4, 4]
+                                                float * cur_data_pos;
+                                                float curData;
+                                                float pred_reg, pred_sz;
+                                                float err_sz = 0.0, err_reg = 0.0;
+                                                int bmi;
+                                                for(int i=2; i<=block_size; i++){
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + i;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                        err_sz += fabs(pred_sz - curData) + noise;
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        bmi = block_size - i;
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + i*pred_buffer_block_size + bmi;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * i + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                        err_sz += fabs(pred_sz - curData) + noise;
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + i;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * i + reg_params_pos[params_offset_d];
+                                                        err_sz += fabs(pred_sz - curData) + noise;
+                                                        err_reg += fabs(pred_reg - curData);
+                                                        cur_data_pos = pred_buffer + i*pred_buffer_block_size*pred_buffer_block_size + bmi*pred_buffer_block_size + bmi;
+                                                        curData = *cur_data_pos;
+                                                        pred_sz = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1] - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                        pred_reg = reg_params_pos[0] * i + reg_params_pos[params_offset_b] * bmi + reg_params_pos[params_offset_c] * bmi + reg_params_pos[params_offset_d];
+                                                        err_sz += fabs(pred_sz - curData) + noise;
+                                                        err_reg += fabs(pred_reg - curData);
+                                                }
+                                                use_reg = (err_reg < err_sz);
+                                        }
+                                        if(use_reg)
+                                        {
+                                                {
+                                                        /*predict coefficients in current block via previous reg_block*/
+                                                        float cur_coeff;
+                                                        double diff, itvNum;
+                                                        for(int e=0; e<4; e++){
+                                                                cur_coeff = reg_params_pos[e*num_blocks];
+                                                                diff = cur_coeff - last_coeffcients[e];
+                                                                itvNum = fabs(diff)/precision[e] + 1;
+                                                                if (itvNum < coeff_intvCapacity_sz){
+                                                                        if (diff < 0) itvNum = -itvNum;
+                                                                        coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
+                                                                        last_coeffcients[e] = last_coeffcients[e] + 2 * (coeff_type[e][coeff_index] - coeff_intvRadius) * precision[e];
+                                                                        //ganrantee comporession error against the case of machine-epsilon
+                                                                        if(fabs(cur_coeff - last_coeffcients[e])>precision[e]){
+                                                                                coeff_type[e][coeff_index] = 0;
+                                                                                last_coeffcients[e] = cur_coeff;
+                                                                                coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                        }
+                                                                }
+                                                                else{
+                                                                        coeff_type[e][coeff_index] = 0;
+                                                                        last_coeffcients[e] = cur_coeff;
+                                                                        coeff_unpred_data[e][coeff_unpredictable_count[e] ++] = cur_coeff;
+                                                                }
+                                                        }
+                                                        coeff_index ++;
+                                                }
+                                                float curData;
+                                                float pred;
+                                                double itvNum;
+                                                double diff;
+                                                size_t index = 0;
+                                                size_t block_unpredictable_count = 0;
+                                                float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                                for(size_t ii=0; ii<block_size; ii++){
+                                                        for(size_t jj=0; jj<block_size; jj++){
+                                                                for(size_t kk=0; kk<block_size; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];
+                                                                        diff = curData - pred;
+                                                                        itvNum = fabs(diff)/tmp_realPrecision + 1;
+                                                                        if (itvNum < intvCapacity){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                                if(fabs(curData - pred)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        pred = curData;
+                                                                                        unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                pred = curData;
+                                                                                unpredictable_data[block_unpredictable_count ++] = curData;
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos ++;
+                                                        }
+                                                        cur_data_pos += pred_buffer_block_size;
+                                                }
+                                                total_unpred += block_unpredictable_count;
+                                                unpredictable_data += block_unpredictable_count;
+                                                reg_count ++;
+                                        }
+                                        else{
+                                                // use SZ
+                                                // SZ predication
+                                                unpredictable_count = 0;
+                                                float * cur_data_pos = pred_buffer + pred_buffer_block_size*pred_buffer_block_size + pred_buffer_block_size + 1;
+                                                float curData;
+                                                float pred3D;
+                                                double itvNum, diff;
+                                                size_t index = 0;
+                                                for(size_t ii=0; ii<block_size; ii++){
+                                                        for(size_t jj=0; jj<block_size; jj++){
+                                                                for(size_t kk=0; kk<block_size; kk++){
+                                                                        curData = *cur_data_pos;
+                                                                        pred3D = cur_data_pos[-1] + cur_data_pos[-strip_dim1_offset]+ cur_data_pos[-strip_dim0_offset] - cur_data_pos[-strip_dim1_offset - 1]
+                                                                                         - cur_data_pos[-strip_dim0_offset - 1] - cur_data_pos[-strip_dim0_offset - strip_dim1_offset] + cur_data_pos[-strip_dim0_offset - strip_dim1_offset - 1];
+                                                                        diff = curData - pred3D;
+                                                                        itvNum = fabs(diff)/realPrecision + 1;
+                                                                        if (itvNum < intvCapacity_sz){
+                                                                                if (diff < 0) itvNum = -itvNum;
+                                                                                type[index] = (int) (itvNum/2) + intvRadius;
+                                                                                *cur_data_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+                                                                                //ganrantee comporession error against the case of machine-epsilon
+                                                                                if(fabs(curData - *cur_data_pos)>tmp_realPrecision){
+                                                                                        type[index] = 0;
+                                                                                        *cur_data_pos = curData;
+                                                                                        unpredictable_data[unpredictable_count ++] = curData;
+                                                                                }
+                                                                        }
+                                                                        else{
+                                                                                type[index] = 0;
+                                                                                *cur_data_pos = curData;
+                                                                                unpredictable_data[unpredictable_count ++] = curData;
+                                                                        }
+                                                                        index ++;
+                                                                        cur_data_pos ++;
+                                                                }
+                                                                cur_data_pos ++;
+                                                        }
+                                                        cur_data_pos += pred_buffer_block_size;
+                                                }
+                                                total_unpred += unpredictable_count;
+                                                unpredictable_data += unpredictable_count;
+                                                // change indicator
+                                                indicator_pos[k] = 1;
+                                        }// end SZ
+                                        reg_params_pos ++;
+                                        type += block_size * block_size * block_size;
+                                }
+                                indicator_pos += num_z;
+                        }
+                }
+        }
+        free(pred_buffer);
+        int stateNum = 2*quantization_intervals;
+        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+        size_t nodeCount = 0;
+        init(huffmanTree, result_type, num_blocks*max_num_block_elements);
+        size_t i = 0;
+        for (i = 0; i < huffmanTree->stateNum; i++)
+                if (huffmanTree->code[i]) nodeCount++;
+        nodeCount = nodeCount*2-1;
+        unsigned char *treeBytes;
+        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+        unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+        // total size                                                                           metadata                  # elements     real precision         intervals       nodeCount               huffman                 block index                                             unpredicatable count                                            mean                                            unpred size                             elements
+        unsigned char * result = (unsigned char *) calloc(meta_data_offset + exe_params->SZ_SIZE_TYPE + sizeof(double) + sizeof(int) + sizeof(int) + treeByteSize + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(unsigned short) + num_blocks * sizeof(float) + total_unpred * sizeof(float) + num_elements * sizeof(int), 1);
+        unsigned char * result_pos = result;
+        initRandomAccessBytes(result_pos);
+        result_pos += meta_data_offset;
+        sizeToBytes(result_pos,num_elements); //SZ_SIZE_TYPE: 4 or 8
+        result_pos += exe_params->SZ_SIZE_TYPE;
+        intToBytes_bigEndian(result_pos, block_size);
+        result_pos += sizeof(int);
+        doubleToBytes(result_pos, realPrecision);
+        result_pos += sizeof(double);
+        intToBytes_bigEndian(result_pos, quantization_intervals);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, treeByteSize);
+        result_pos += sizeof(int);
+        intToBytes_bigEndian(result_pos, nodeCount);
+        result_pos += sizeof(int);
+        memcpy(result_pos, treeBytes, treeByteSize);
+        result_pos += treeByteSize;
+        free(treeBytes);
+        memcpy(result_pos, &use_mean, sizeof(unsigned char));
+        result_pos += sizeof(unsigned char);
+        memcpy(result_pos, &mean, sizeof(float));
+        result_pos += sizeof(float);
+        size_t indicator_size = convertIntArray2ByteArray_fast_1b_to_result(indicator, num_blocks, result_pos);
+        result_pos += indicator_size;
+        //convert the lead/mid/resi to byte stream
+        if(reg_count > 0){
+                for(int e=0; e<4; e++){
+                        int stateNum = 2*coeff_intvCapacity_sz;
+                        HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+                        size_t nodeCount = 0;
+                        init(huffmanTree, coeff_type[e], reg_count);
+                        size_t i = 0;
+                        for (i = 0; i < huffmanTree->stateNum; i++)
+                                if (huffmanTree->code[i]) nodeCount++;
+                        nodeCount = nodeCount*2-1;
+                        unsigned char *treeBytes;
+                        unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+                        doubleToBytes(result_pos, precision[e]);
+                        result_pos += sizeof(double);
+                        intToBytes_bigEndian(result_pos, coeff_intvRadius);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, treeByteSize);
+                        result_pos += sizeof(int);
+                        intToBytes_bigEndian(result_pos, nodeCount);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, treeBytes, treeByteSize);
+                        result_pos += treeByteSize;
+                        free(treeBytes);
+                        size_t typeArray_size = 0;
+                        encode(huffmanTree, coeff_type[e], reg_count, result_pos + sizeof(size_t), &typeArray_size);
+                        sizeToBytes(result_pos, typeArray_size);
+                        result_pos += sizeof(size_t) + typeArray_size;
+                        intToBytes_bigEndian(result_pos, coeff_unpredictable_count[e]);
+                        result_pos += sizeof(int);
+                        memcpy(result_pos, coeff_unpred_data[e], coeff_unpredictable_count[e]*sizeof(float));
+                        result_pos += coeff_unpredictable_count[e]*sizeof(float);
+                        SZ_ReleaseHuffman(huffmanTree);
+                }
+        }
+        free(coeff_result_type);
+        free(coeff_unpredictable_data);
+        //record the number of unpredictable data and also store them
+        memcpy(result_pos, &total_unpred, sizeof(size_t));
+        result_pos += sizeof(size_t);
+        memcpy(result_pos, result_unpredictable_data, total_unpred * sizeof(float));
+        result_pos += total_unpred * sizeof(float);
+        size_t typeArray_size = 0;
+        encode(huffmanTree, result_type, num_blocks*max_num_block_elements, result_pos, &typeArray_size);
+        result_pos += typeArray_size;
+        size_t totalEncodeSize = result_pos - result;
+        free(indicator);
+        free(result_unpredictable_data);
+        free(result_type);
+        free(reg_params);
+        SZ_ReleaseHuffman(huffmanTree);
+        *comp_size = totalEncodeSize;
+        return result;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 9ee2ce3 for thirdparty/SZ/sz/src/sz_float.c

Legend:

thirdparty/SZ/sz/src/sz_float.c

Download in other formats: