/* decode_idct.cl */

/* Number of entries in DecodeInfo::component_infos. */
#define MAX_COMPONENT_INFO_COUNT 5
/* Number of coefficients in one DCT block (DCTSIZE * DCTSIZE). */
#define DCTSIZE2 64
/* Side length of one DCT block. */
#define DCTSIZE 8
/* Largest sample value; with CENTERJSAMPLE it sizes the range-limit table. */
#define MAXJSAMPLE	255
/* Used in the range-limit table size and in the IDCT_range_limit offset. */
#define CENTERJSAMPLE	128
/* One DCT coefficient as read from the decoded-MCU buffer. */
typedef short JCOEF;
typedef JCOEF JBLOCK[DCTSIZE2];	/* one block of coefficients */
/* One output sample (a single byte). */
typedef unsigned char JSAMPLE;
/* Unsigned type used for column positions in the output buffer. */
typedef unsigned int JDIMENSION;
/* Integer types used by the fixed-point helper macros. */
typedef int INT32;
typedef short INT16;
/* Floating-point type used for all arithmetic in the live IDCT. */
typedef float FAST_FLOAT;
typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
#define MULTIPLIER  int		/* type for fastest integer multiply */
/* Disabled: multiplier type for the integer IDCT, which is commented out below. */
// typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
/*
 * Per-component parameters read by the idct kernel and inverse_DCT.
 * NOTE(review): presumably filled in by host code with an identical layout;
 * do not reorder or resize fields without checking the host side.
 */
struct ComponentInfo
{
    /* Blocks per MCU row of this component; also the per-row stride (in
     * blocks) through the decoded-MCU buffer. */
    unsigned int MCU_width;
    /* Number of block rows processed per MCU for this component. */
    unsigned int MCU_height;
    /* Number of blocks processed per row in the last MCU column
     * (used instead of MCU_width there). */
    unsigned int last_col_width;
    /* Output columns spanned by one MCU: start column = MCU column * this. */
    unsigned int MCU_sample_width;
    /* Output columns (and rows) advanced per block. */
    unsigned int DCT_scaled_size;
    /* Stride, in samples, between consecutive output rows. */
    unsigned int row_buffer_size;
    /* Offset, in samples, of this component's data within the output buffer. */
    unsigned int previous_image_size;
    /* Offset, in blocks, added to this component's position in the
     * decoded-MCU buffer. */
    unsigned int previous_decoded_mcu_size; 
    /* Dequantization multipliers, one per coefficient position. */
    FLOAT_MULT_TYPE dct_table[DCTSIZE2];
};

/*
 * Top-level decode parameters passed to the idct kernel.
 * NOTE(review): presumably mirrors a host-side structure; confirm the layout
 * matches before changing anything here.
 */
struct DecodeInfo
{
   /* Stride, in blocks, between consecutive MCUs in the decoded-MCU buffer.
    * (presumably the total block count of one MCU across components — verify) */
   unsigned int componets_mcu_width;
   /* Sample range-limit lookup table; accessed through IDCT_range_limit(). */
   JSAMPLE  sample_range_limit[(5 * (MAXJSAMPLE+1) + CENTERJSAMPLE)]; 
   /* Per-component parameters, indexed by the kernel's group id in dimension 2. */
   struct ComponentInfo component_infos[MAX_COMPONENT_INFO_COUNT]; 
};

/*
 * Pointer into the range-limit table, offset by CENTERJSAMPLE + (MAXJSAMPLE+1)
 * = 384 entries.  Indexed with a value masked by RANGE_MASK (0..1023), the
 * highest entry touched is 384 + 1023 = 1407, the last element of the table.
 */
#define IDCT_range_limit(cinfo)  ((cinfo)->sample_range_limit + CENTERJSAMPLE + (MAXJSAMPLE+1))
/* Dequantize one coefficient: convert to float and scale by the table entry. */
#define DEQUANTIZE(coef,quantval)  (((FAST_FLOAT) (coef)) * (quantval))
/*
 * Fixed-point scaling parameters and constants.  In the code visible here
 * they (and MULTIPLY / MULTIPLY16C16) are referenced only by the
 * commented-out integer IDCT; the live float IDCT does not use them.
 */
#define CONST_BITS  13
#define PASS1_BITS  2
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
/* Multiply after narrowing BOTH operands to 16 bits; wider values are truncated. */
#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
/*
 * Divide by 2**n with rounding: add half of the divisor, then shift right.
 * ONE and RIGHT_SHIFT are defined just below; that is fine because macros
 * are expanded at the point of use.
 */
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
#define ONE	((INT32) 1)
#define RIGHT_SHIFT(x,shft)	((x) >> (shft))
/* Mask applied to a descaled result before it indexes the range-limit table. */
#define RANGE_MASK  (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */

// void inverse_DCT(__global struct DecodeInfo * cinfo,
//                 __global struct ComponentInfo * compptr,
//                 __global JCOEF * coef_block,
//                 __global JSAMPLE * output_buf,
//                 JDIMENSION output_col)
// {
//   INT32 tmp0, tmp1, tmp2, tmp3;
//   INT32 tmp10, tmp11, tmp12, tmp13;
//   INT32 z1, z2, z3, z4, z5;
//   __global JCOEF * inptr;
//   __global ISLOW_MULT_TYPE * quantptr;
//   int * wsptr;
//   __global JSAMPLE * outptr;
//   __global JSAMPLE *range_limit = IDCT_range_limit(cinfo);
//   int ctr;
//   int workspace[DCTSIZE2];	/* buffers data between passes */
// 
//   /* Pass 1: process columns from input, store into work array. */
//   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
//   /* furthermore, we scale the results by 2**PASS1_BITS. */
// 
//   inptr = coef_block;
//   quantptr = (__global ISLOW_MULT_TYPE *) compptr->dct_table;
//   wsptr = workspace;
//   for (ctr = DCTSIZE; ctr > 0; ctr--) {
//     /* Due to quantization, we will usually find that many of the input
//      * coefficients are zero, especially the AC terms.  We can exploit this
//      * by short-circuiting the IDCT calculation for any column in which all
//      * the AC terms are zero.  In that case each output is equal to the
//      * DC coefficient (with scale factor as needed).
//      * With typical images and quantization tables, half or more of the
//      * column DCT calculations can be simplified this way.
//      */
//     
//     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
// 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
// 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
// 	inptr[DCTSIZE*7] == 0) {
//       /* AC terms all zero */
//       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
//       
//       wsptr[DCTSIZE*0] = dcval;
//       wsptr[DCTSIZE*1] = dcval;
//       wsptr[DCTSIZE*2] = dcval;
//       wsptr[DCTSIZE*3] = dcval;
//       wsptr[DCTSIZE*4] = dcval;
//       wsptr[DCTSIZE*5] = dcval;
//       wsptr[DCTSIZE*6] = dcval;
//       wsptr[DCTSIZE*7] = dcval;
//       
//       inptr++;			/* advance pointers to next column */
//       quantptr++;
//       wsptr++;
//       continue;
//     }
//     /* Even part: reverse the even part of the forward DCT. */
//     /* The rotator is sqrt(2)*c(-6). */
//     
//     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
//     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
//     
//     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
//     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
//     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
//     
//     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
//     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
// 
//     tmp0 = (z2 + z3) << CONST_BITS;
//     tmp1 = (z2 - z3) << CONST_BITS;
//     
//     tmp10 = tmp0 + tmp3;
//     tmp13 = tmp0 - tmp3;
//     tmp11 = tmp1 + tmp2;
//     tmp12 = tmp1 - tmp2;
//     
//     /* Odd part per figure 8; the matrix is unitary and hence its
//      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
//      */
//     
//     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
//     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
//     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
//     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
//     
//     z1 = tmp0 + tmp3;
//     z2 = tmp1 + tmp2;
//     z3 = tmp0 + tmp2;
//     z4 = tmp1 + tmp3;
//     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
//     
//     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
//     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
//     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
//     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
//     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
//     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
//     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
//     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
//     
//     z3 += z5;
//     z4 += z5;
//     
//     tmp0 += z1 + z3;
//     tmp1 += z2 + z4;
//     tmp2 += z2 + z3;
//     tmp3 += z1 + z4;
//     
//     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
//     
//     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
//     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
//     
//     inptr++;			/* advance pointers to next column */
//     quantptr++;
//     wsptr++;
//   }
//   
//   /* Pass 2: process rows from work array, store into output array. */
//   /* Note that we must descale the results by a factor of 8 == 2**3, */
//   /* and also undo the PASS1_BITS scaling. */
// 
//   wsptr = workspace;
//   for (ctr = 0; ctr < DCTSIZE; ctr++) {
//     outptr = output_buf + ctr * compptr->row_buffer_size + output_col;
//     /* Rows of zeroes can be exploited in the same way as we did with columns.
//      * However, the column calculation has created many nonzero AC terms, so
//      * the simplification applies less often (typically 5% to 10% of the time).
//      * On machines with very fast multiplication, it's possible that the
//      * test takes more time than it's worth.  In that case this section
//      * may be commented out.
//      */
//     
// #ifndef NO_ZERO_ROW_TEST
//     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
// 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
//       /* AC terms all zero */
//       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
// 				  & RANGE_MASK];
//       
//       outptr[0] = dcval;
//       outptr[1] = dcval;
//       outptr[2] = dcval;
//       outptr[3] = dcval;
//       outptr[4] = dcval;
//       outptr[5] = dcval;
//       outptr[6] = dcval;
//       outptr[7] = dcval;
// 
//       wsptr += DCTSIZE;		/* advance pointer to next row */
//       continue;
//     }
// #endif
//     
//     /* Even part: reverse the even part of the forward DCT. */
//     /* The rotator is sqrt(2)*c(-6). */
//     
//     z2 = (INT32) wsptr[2];
//     z3 = (INT32) wsptr[6];
//     
//     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
//     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
//     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
//     
//     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
//     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
//     
//     tmp10 = tmp0 + tmp3;
//     tmp13 = tmp0 - tmp3;
//     tmp11 = tmp1 + tmp2;
//     tmp12 = tmp1 - tmp2;
//     
//     /* Odd part per figure 8; the matrix is unitary and hence its
//      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
//      */
//     
//     tmp0 = (INT32) wsptr[7];
//     tmp1 = (INT32) wsptr[5];
//     tmp2 = (INT32) wsptr[3];
//     tmp3 = (INT32) wsptr[1];
//     
//     z1 = tmp0 + tmp3;
//     z2 = tmp1 + tmp2;
//     z3 = tmp0 + tmp2;
//     z4 = tmp1 + tmp3;
//     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
//     
//     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
//     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
//     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
//     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
//     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
//     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
//     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
//     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
//     
//     z3 += z5;
//     z4 += z5;
//     
//     tmp0 += z1 + z3;
//     tmp1 += z2 + z4;
//     tmp2 += z2 + z3;
//     tmp3 += z1 + z4;
//     
//     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
//     
//     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
// 					  CONST_BITS+PASS1_BITS+3)
// 			    & RANGE_MASK];
//     
//     wsptr += DCTSIZE;		/* advance pointer to next row */
//   }
// }


/*
 * Dequantize one 8x8 coefficient block and apply a floating-point inverse
 * DCT, cooperatively across the work-items of a work-group.
 *
 * Each work-item uses get_local_id(2) as its lane index: in pass 1 it
 * transforms column <lane> of the block into the shared workspace, and in
 * pass 2 it transforms row <lane> of the workspace and writes 8 samples to
 * the output.
 *
 * Parameters:
 *   cinfo      - decode info; supplies the sample range-limit table.
 *   compptr    - component info; supplies dct_table (dequantization
 *                multipliers) and row_buffer_size (output row stride).
 *   coef_block - the 64 coefficients of the block to transform.
 *   output_buf - base of the output rows for this block row.
 *   output_col - column within each output row where this block starts.
 *
 * Synchronization: this function contains work-group barriers, so every
 * work-item of the group must call it the same number of times.
 *
 * NOTE(review): assumes the work-group has exactly DCTSIZE (8) work-items in
 * dimension 2 and one in each other dimension; a larger group would index
 * past the workspace or share it between unrelated blocks — confirm against
 * the host-side enqueue.
 *
 * NOTE(review): the OpenCL C specification restricts __local variable
 * declarations to kernel-function scope; declaring the workspace in this
 * helper relies on the compiler accepting it — confirm on the targeted
 * platforms.
 */
void inverse_DCT(__global struct DecodeInfo * cinfo,
                __global struct ComponentInfo * compptr,
                __global JCOEF * coef_block,
                __global JSAMPLE * output_buf,
                JDIMENSION output_col)
{
  FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
  FAST_FLOAT z5, z10, z11, z12, z13;
  __global JCOEF * inptr;
  __global FLOAT_MULT_TYPE * quantptr;
  __local FAST_FLOAT * wsptr;
  __global JSAMPLE * outptr;
  __global JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  int ctr;
  __local FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */

  /* Pass 1: process columns from input, store into work array. */

  inptr = coef_block;
  quantptr = (__global FLOAT_MULT_TYPE *) compptr->dct_table;
  wsptr = workspace;
  ctr = get_local_id(2);        /* lane index: column in pass 1, row in pass 2 */
  inptr += ctr;
  quantptr += ctr;
  wsptr += ctr;

  /* Due to quantization, we will usually find that many of the input
   * coefficients are zero, especially the AC terms.  We can exploit this
   * by short-circuiting the IDCT calculation for any column in which all
   * the AC terms are zero.  In that case each output is equal to the
   * DC coefficient (with scale factor as needed).
   * With typical images and quantization tables, half or more of the
   * column DCT calculations can be simplified this way.
   */

  if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
      inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
      inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
      inptr[DCTSIZE*7] == 0) {
    /* AC terms all zero */
    FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);

    wsptr[DCTSIZE*0] = dcval;
    wsptr[DCTSIZE*1] = dcval;
    wsptr[DCTSIZE*2] = dcval;
    wsptr[DCTSIZE*3] = dcval;
    wsptr[DCTSIZE*4] = dcval;
    wsptr[DCTSIZE*5] = dcval;
    wsptr[DCTSIZE*6] = dcval;
    wsptr[DCTSIZE*7] = dcval;
  }
  else
  {
    /* Even part */

    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);

    tmp10 = tmp0 + tmp2;	/* phase 3 */
    tmp11 = tmp0 - tmp2;

    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */

    tmp0 = tmp10 + tmp13;	/* phase 2 */
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    /* Odd part */

    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);

    z13 = tmp6 + tmp5;		/* phase 6 */
    z10 = tmp6 - tmp5;
    z11 = tmp4 + tmp7;
    z12 = tmp4 - tmp7;

    tmp7 = z11 + z13;		/* phase 5 */
    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */

    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */

    tmp6 = tmp12 - tmp7;	/* phase 2 */
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    wsptr[DCTSIZE*0] = tmp0 + tmp7;
    wsptr[DCTSIZE*7] = tmp0 - tmp7;
    wsptr[DCTSIZE*1] = tmp1 + tmp6;
    wsptr[DCTSIZE*6] = tmp1 - tmp6;
    wsptr[DCTSIZE*2] = tmp2 + tmp5;
    wsptr[DCTSIZE*5] = tmp2 - tmp5;
    wsptr[DCTSIZE*4] = tmp3 + tmp4;
    wsptr[DCTSIZE*3] = tmp3 - tmp4;
  }

  /* Every column must be in the workspace before any row is read. */
  barrier(CLK_LOCAL_MEM_FENCE);

  /* Pass 2: process rows from work array, store into output array. */
  /* Note that we must descale the results by a factor of 8 == 2**3. */

  wsptr = workspace;
  wsptr += DCTSIZE * ctr;
  outptr = output_buf + ctr * compptr->row_buffer_size + output_col;

  /* Rows of zeroes can be exploited in the same way as we did with columns.
   * However, the column calculation has created many nonzero AC terms, so
   * the simplification applies less often (typically 5% to 10% of the time).
   * And testing floats for zero is relatively expensive, so we don't bother.
   */

  /* Even part */

  tmp10 = wsptr[0] + wsptr[4];
  tmp11 = wsptr[0] - wsptr[4];

  tmp13 = wsptr[2] + wsptr[6];
  tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;

  tmp0 = tmp10 + tmp13;
  tmp3 = tmp10 - tmp13;
  tmp1 = tmp11 + tmp12;
  tmp2 = tmp11 - tmp12;

  /* Odd part */

  z13 = wsptr[5] + wsptr[3];
  z10 = wsptr[5] - wsptr[3];
  z11 = wsptr[1] + wsptr[7];
  z12 = wsptr[1] - wsptr[7];

  tmp7 = z11 + z13;
  tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);

  z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
  tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
  tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */

  tmp6 = tmp12 - tmp7;
  tmp5 = tmp11 - tmp6;
  tmp4 = tmp10 + tmp5;

  /* Final output stage: scale down by a factor of 8 and range-limit */

  outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
                          & RANGE_MASK];
  outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
                          & RANGE_MASK];
  outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
                          & RANGE_MASK];
  outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
                          & RANGE_MASK];
  outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
                          & RANGE_MASK];
  outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
                          & RANGE_MASK];
  outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
                          & RANGE_MASK];
  outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
                          & RANGE_MASK];

  /* The workspace is reused by the next call from the same work-group.
   * Without this barrier a work-item that returns early could begin the next
   * block's pass-1 column writes while another work-item is still reading
   * its pass-2 row of this block.
   */
  barrier(CLK_LOCAL_MEM_FENCE);
}

/*
 * Kernel entry point: inverse-DCT every block of one MCU of one component.
 *
 * Work-item coordinates, as read below:
 *   global id 0   - MCU row index
 *   global id 1   - MCU column index (global size 1 = MCUs per row)
 *   group id 2    - component index into cinfo->component_infos
 * The local id in dimension 2 is consumed inside inverse_DCT.
 *
 * Parameters:
 *   cinfo            - decode parameters and per-component info.
 *   decoded_mcu_base - coefficient blocks of all MCUs.
 *   output           - destination sample buffer for all components.
 */
__kernel void idct(__global struct DecodeInfo * cinfo,
               __global JBLOCK * decoded_mcu_base,
               __global JSAMPLE *  output)
{
   const int mcu_row = get_global_id(0);
   const JDIMENSION MCU_col_num = get_global_id(1);	/* index of current MCU within row */
   const int comp_index = get_group_id(2);
   const int mcus_per_row = get_global_size(1);
   const int final_mcu_col = mcus_per_row - 1;

   __global struct ComponentInfo * const comp = &cinfo->component_infos[comp_index];

   /* First output row of this MCU: skip the preceding components' data, then
    * the sample rows of all MCU rows above this one. */
   __global JSAMPLE * band = output + comp->previous_image_size;
   band += mcu_row * comp->DCT_scaled_size * comp->row_buffer_size * comp->MCU_height;

   /* Leftmost output column covered by this MCU. */
   const JDIMENSION first_col = MCU_col_num * comp->MCU_sample_width;

   /* The last MCU column uses its own block count per row. */
   int blocks_across;
   if (MCU_col_num < final_mcu_col) {
       blocks_across = comp->MCU_width;
   } else {
       blocks_across = comp->last_col_width;
   }

   /* First coefficient block of this component inside this MCU. */
   __global JBLOCK * block_row = decoded_mcu_base
       + ((mcu_row * mcus_per_row + MCU_col_num) * cinfo->componets_mcu_width);
   block_row += comp->previous_decoded_mcu_size;

   int by;
   for (by = 0; by < comp->MCU_height; by++) {
       int bx;
       for (bx = 0; bx < blocks_across; bx++) {
           /* Each block lands DCT_scaled_size columns to the right of the previous one. */
           inverse_DCT(cinfo, comp,
                   (__global JCOEF *) (block_row + bx),
                   band, first_col + bx * comp->DCT_scaled_size);
       }
       /* Step to the next row of blocks and the matching band of output rows. */
       block_row += comp->MCU_width;
       band += comp->DCT_scaled_size * comp->row_buffer_size;
   }
}