unit imjidctred;


{ This file contains inverse-DCT routines that produce reduced-size output:
  either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.

  The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M)
  algorithm used in jidctint.c.  We simply replace each 8-to-8 1-D IDCT step
  with an 8-to-4 step that produces the four averages of two adjacent outputs
  (or an 8-to-2 step producing two averages of four outputs, for 2x2 output).
  These steps were derived by computing the corresponding values at the end
  of the normal LL&M code, then simplifying as much as possible.

  1x1 is trivial: just take the DC coefficient divided by 8.

  See jidctint.c for additional comments. }


{ Original : jidctred.c ; Copyright (C) 1994-1998, Thomas G. Lane. }

interface

{$I imjconfig.inc}

uses
  imjmorecfg,
  imjinclude,
  imjpeglib,
  imjdct;         	{ Private declarations for DCT subsystem }

{ Perform dequantization and inverse DCT on one block of coefficients,
  producing a reduced-size 1x1 output block.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block; only element 0 (DC) is read.
  output_buf  array of sample rows; row 0 is written.
  output_col  index within that row of the single sample written. }

{GLOBAL}
procedure jpeg_idct_1x1 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
	                 coef_block : JCOEFPTR;
	                 output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);

{ Perform dequantization and inverse DCT on one block of coefficients,
  producing a reduced-size 2x2 output block.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block to transform.
  output_buf  array of sample rows; rows 0..1 are written.
  output_col  index within each row of the first of the 2 samples written. }

{GLOBAL}
procedure jpeg_idct_2x2 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
	                 coef_block : JCOEFPTR;
                         output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);

{ Perform dequantization and inverse DCT on one block of coefficients,
  producing a reduced-size 4x4 output block.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block to transform.
  output_buf  array of sample rows; rows 0..3 are written.
  output_col  index within each row of the first of the 4 samples written. }

{GLOBAL}
procedure jpeg_idct_4x4 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
	                 coef_block : JCOEFPTR;
	                 output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);

implementation

{ Everything below hard-codes an 8x8 DCT block, so refuse to compile for
  any other block size. }

{$ifndef DCTSIZE_IS_8}
  Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
{$endif}


{ Fixed-point scaling, the same as in jidctint.c.

  CONST_BITS is the number of fraction bits carried by the FIX_ constants;
  it is 13 for every sample depth.  PASS1_BITS is the number of extra
  fraction bits kept in the workspace between the column pass and the row
  pass; it depends on the sample depth. }

const
  CONST_BITS = 13;
{$ifdef BITS_IN_JSAMPLE_IS_8}
  PASS1_BITS = 2;
{$else}
  PASS1_BITS = 1;       { lose a little precision to avoid overflow }
{$endif}

{ Each FIX_a_bbbbbbbbb constant is the real number a.bbbbbbbbb multiplied by
  2**CONST_BITS and rounded to the nearest integer; the resulting value is
  given in the trailing comment. }

const
  FIX_ONE = INT32(1) shl CONST_BITS;                            {8192}

  FIX_0_211164243 = INT32(Round(FIX_ONE * 0.211164243));        {1730}
  FIX_0_509795579 = INT32(Round(FIX_ONE * 0.509795579));        {4176}
  FIX_0_601344887 = INT32(Round(FIX_ONE * 0.601344887));        {4926}
  FIX_0_720959822 = INT32(Round(FIX_ONE * 0.720959822));        {5906}
  FIX_0_765366865 = INT32(Round(FIX_ONE * 0.765366865));        {6270}
  FIX_0_850430095 = INT32(Round(FIX_ONE * 0.850430095));        {6967}
  FIX_0_899976223 = INT32(Round(FIX_ONE * 0.899976223));        {7373}
  FIX_1_061594337 = INT32(Round(FIX_ONE * 1.061594337));        {8697}
  FIX_1_272758580 = INT32(Round(FIX_ONE * 1.272758580));        {10426}
  FIX_1_451774981 = INT32(Round(FIX_ONE * 1.451774981));        {11893}
  FIX_1_847759065 = INT32(Round(FIX_ONE * 1.847759065));        {15137}
  FIX_2_172734803 = INT32(Round(FIX_ONE * 2.172734803));        {17799}
  FIX_2_562915447 = INT32(Round(FIX_ONE * 2.562915447));        {20995}
  FIX_3_624509785 = INT32(Round(FIX_ONE * 3.624509785));        {29692}


{ Fixed-point multiply: returns the INT32 product of a variable (X) and one
  of the FIX_ constants (Y).

  For 8-bit samples with the recommended scaling, neither operand needs
  more than 16 bits, so the operands are declared as Integer and a
  16x16->32 bit multiply is sufficient (MULTIPLY16C16 in the C original).
  For 12-bit samples both operands are full INT32 values and a full 32-bit
  multiplication is performed. }

{$ifdef BITS_IN_JSAMPLE_IS_8}

   function Multiply(X, Y: Integer): INT32;
   begin
     { Widen both operands first so the product is formed in INT32. }
     Multiply := INT32(X) * INT32(Y);
   end;

{$else}

   function Multiply(X, Y: INT32): INT32;
   begin
     Multiply := X * Y;
   end;

{$endif}


{ Dequantize one coefficient: scale coef by its multiplier-table entry
  quantval and return the product as an int.  In this module both inputs
  and the result are 16 bits or less, so either an int or a short multiply
  will work.

  The IDCT routines in this unit spell the same expression out inline. }

function DEQUANTIZE(coef,quantval : int) : int;
begin
  DEQUANTIZE := quantval * ISLOW_MULT_TYPE(coef);
end;


{ Descale and correctly round an INT32 value that's scaled by N bits.

  The rounding is done by adding half of the divisor, 1 shl (n-1), before
  shifting right by n.  The right shift is assumed to round towards minus
  infinity, so adding that fudge factor is correct for either sign of x.

  x  the scaled value.
  n  the number of fraction bits to remove.

  When RIGHT_SHIFT_IS_UNSIGNED is defined, shr fills with zero bits, so for
  a negative intermediate value the n vacated high bits are set back to one
  by or-ing in an all-ones mask shifted left by (32-n).  Otherwise shr is
  taken to be an arithmetic shift and is used directly. }

function DESCALE(x : INT32; n : int) : INT32;
{$ifdef RIGHT_SHIFT_IS_UNSIGNED}
var
  shift_temp : INT32;   { x plus the rounding term; only needed here }
{$endif}
begin
{$ifdef RIGHT_SHIFT_IS_UNSIGNED}
  shift_temp := x + (INT32(1) shl (n-1));
  if shift_temp < 0 then
    { restore the sign bits that the logical shift cleared }
    Descale :=  (shift_temp shr n) or ((not INT32(0)) shl (32-n))
  else
    Descale :=  (shift_temp shr n);
{$else}
  { The original line was missing its closing parenthesis and could not
    compile in this configuration. }
  Descale := (x + (INT32(1) shl (n-1))) shr n;
{$endif}
end;

{ Dequantize one block of DCT coefficients and apply a reduced-size inverse
  DCT to it, producing a 4x4 block of output samples.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block to transform.
  output_buf  array of sample rows; rows 0..3 are written.
  output_col  index within each row of the first of the 4 samples written.

  The work is done in two passes.  Pass 1 walks the input columns and
  stores four values per column into a local workspace; pass 2 walks the
  four workspace rows and stores four range-limited samples per row. }

{GLOBAL}
procedure jpeg_idct_4x4 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
                         coef_block : JCOEFPTR;
                         output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);
type
  TWorkspace = array[0..(DCTSIZE*4)-1] of int; { 4 rows of DCTSIZE values }
  PWorkspace = ^TWorkspace;
var
  workspace : TWorkspace;       { buffers data between passes }
  ws : PWorkspace;              { current column (pass 1) / row (pass 2) }
  coefs : JCOEFPTR;             { current input column }
  quant : ISLOW_MULT_TYPE_FIELD_PTR; { multipliers for the current column }
  dest : JSAMPROW;              { where the current output row is stored }
  limit : JSAMPROW;             { range-limit table, see below }
  cols_left : int;
  row : int;
  flat_col : int;               { value of an all-DC column }
  flat_sample : JSAMPLE;        { value of an all-DC row }
  even_dc, even_rot : INT32;    { even part: DC term and the 2/6 term }
  even_sum, even_diff : INT32;  { even_dc + even_rot, even_dc - even_rot }
  odd_inner, odd_outer : INT32; { odd part for outputs 1,2 and for 0,3 }
  in1, in2, in3, in5, in6, in7 : INT32; { inputs, by coefficient index }
begin
  { Each IDCT routine is responsible for range-limiting its results and
    converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
    be quite far out of range if the input data is corrupt, so a bulletproof
    range-limiting step is required.  A mask-and-table-lookup method does
    both operations at once: every result is masked with RANGE_MASK and
    used as an index into the table, addressed from entry CENTERJSAMPLE. }

  limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));

  { Pass 1: process columns from input, store into work array. }

  coefs := coef_block;
  quant := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
  ws := @workspace;
  for cols_left := DCTSIZE downto 1 do
  begin
    { Column 4 is not processed at all, because pass 2 never reads it. }
    if cols_left <> DCTSIZE-4 then
    begin
      if (coefs^[DCTSIZE*1]<>0) or (coefs^[DCTSIZE*2]<>0) or
         (coefs^[DCTSIZE*3]<>0) or (coefs^[DCTSIZE*5]<>0) or
         (coefs^[DCTSIZE*6]<>0) or (coefs^[DCTSIZE*7]<>0) then
      begin
        { Even part }

        even_dc := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*0]) * quant^[DCTSIZE*0]);
        even_dc := even_dc shl (CONST_BITS+1);

        in2 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*2]) * quant^[DCTSIZE*2]);
        in6 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*6]) * quant^[DCTSIZE*6]);

        even_rot := MULTIPLY(in2, FIX_1_847759065)
                  + MULTIPLY(in6, - FIX_0_765366865);

        even_sum := even_dc + even_rot;
        even_diff := even_dc - even_rot;

        { Odd part }

        in7 := ISLOW_MULT_TYPE(coefs^[DCTSIZE*7]) * quant^[DCTSIZE*7];
        in5 := ISLOW_MULT_TYPE(coefs^[DCTSIZE*5]) * quant^[DCTSIZE*5];
        in3 := ISLOW_MULT_TYPE(coefs^[DCTSIZE*3]) * quant^[DCTSIZE*3];
        in1 := ISLOW_MULT_TYPE(coefs^[DCTSIZE*1]) * quant^[DCTSIZE*1];

        odd_inner := MULTIPLY(in7, - FIX_0_211164243) { sqrt(2) * (c3-c1) }
                   + MULTIPLY(in5, FIX_1_451774981)   { sqrt(2) * (c3+c7) }
                   + MULTIPLY(in3, - FIX_2_172734803) { sqrt(2) * (-c1-c5) }
                   + MULTIPLY(in1, FIX_1_061594337);  { sqrt(2) * (c5+c7) }

        odd_outer := MULTIPLY(in7, - FIX_0_509795579) { sqrt(2) * (c7-c5) }
                   + MULTIPLY(in5, - FIX_0_601344887) { sqrt(2) * (c5-c1) }
                   + MULTIPLY(in3, FIX_0_899976223)   { sqrt(2) * (c3-c7) }
                   + MULTIPLY(in1, FIX_2_562915447);  { sqrt(2) * (c1+c3) }

        { Final output stage: one value for each of the 4 workspace rows }

        ws^[DCTSIZE*0] := int(DESCALE(even_sum + odd_outer,
                                      CONST_BITS-PASS1_BITS+1));
        ws^[DCTSIZE*1] := int(DESCALE(even_diff + odd_inner,
                                      CONST_BITS-PASS1_BITS+1));
        ws^[DCTSIZE*2] := int(DESCALE(even_diff - odd_inner,
                                      CONST_BITS-PASS1_BITS+1));
        ws^[DCTSIZE*3] := int(DESCALE(even_sum - odd_outer,
                                      CONST_BITS-PASS1_BITS+1));
      end
      else
      begin
        { AC terms all zero; term 4 need not be examined for 4x4 output.
          The scaled DC value is stored in all four rows. }
        flat_col := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*0]) *
                        quant^[DCTSIZE*0]) shl PASS1_BITS;

        ws^[DCTSIZE*0] := flat_col;
        ws^[DCTSIZE*1] := flat_col;
        ws^[DCTSIZE*2] := flat_col;
        ws^[DCTSIZE*3] := flat_col;
      end;
    end;

    { advance all three pointers to the next column }
    Inc(JCOEF_PTR(coefs));
    Inc(ISLOW_MULT_TYPE_PTR(quant));
    Inc(int_ptr(ws));
  end;

  { Pass 2: process 4 rows from work array, store into output array. }

  ws := @workspace;
  for row := 0 to 3 do
  begin
    dest := JSAMPROW(@ output_buf^[row]^[output_col]);

    { It's not clear whether a zero row test is worthwhile here, so it can
      be compiled out by defining NO_ZERO_ROW_TEST. }

{$ifndef NO_ZERO_ROW_TEST}
    if (ws^[1]=0) and (ws^[2]=0) and (ws^[3]=0) and
       (ws^[5]=0) and (ws^[6]=0) and (ws^[7]=0) then
    begin
      { AC terms all zero: all four samples of the row are equal }
      flat_sample := limit^[int(DESCALE(INT32(ws^[0]), PASS1_BITS+3))
                            and RANGE_MASK];

      dest^[0] := flat_sample;
      dest^[1] := flat_sample;
      dest^[2] := flat_sample;
      dest^[3] := flat_sample;
    end
    else
{$endif}
    begin
      { Even part }

      even_dc := (INT32(ws^[0])) shl (CONST_BITS+1);

      even_rot := MULTIPLY(INT32(ws^[2]), FIX_1_847759065)
                + MULTIPLY(INT32(ws^[6]), - FIX_0_765366865);

      even_sum := even_dc + even_rot;
      even_diff := even_dc - even_rot;

      { Odd part }

      in7 := INT32(ws^[7]);
      in5 := INT32(ws^[5]);
      in3 := INT32(ws^[3]);
      in1 := INT32(ws^[1]);

      odd_inner := MULTIPLY(in7, - FIX_0_211164243) { sqrt(2) * (c3-c1) }
                 + MULTIPLY(in5, FIX_1_451774981)   { sqrt(2) * (c3+c7) }
                 + MULTIPLY(in3, - FIX_2_172734803) { sqrt(2) * (-c1-c5) }
                 + MULTIPLY(in1, FIX_1_061594337);  { sqrt(2) * (c5+c7) }

      odd_outer := MULTIPLY(in7, - FIX_0_509795579) { sqrt(2) * (c7-c5) }
                 + MULTIPLY(in5, - FIX_0_601344887) { sqrt(2) * (c5-c1) }
                 + MULTIPLY(in3, FIX_0_899976223)   { sqrt(2) * (c3-c7) }
                 + MULTIPLY(in1, FIX_2_562915447);  { sqrt(2) * (c1+c3) }

      { Final output stage: descale, mask and range-limit each sample }

      dest^[0] := limit^[ int(DESCALE(even_sum + odd_outer,
                                      CONST_BITS+PASS1_BITS+3+1))
                          and RANGE_MASK];
      dest^[1] := limit^[ int(DESCALE(even_diff + odd_inner,
                                      CONST_BITS+PASS1_BITS+3+1))
                          and RANGE_MASK];
      dest^[2] := limit^[ int(DESCALE(even_diff - odd_inner,
                                      CONST_BITS+PASS1_BITS+3+1))
                          and RANGE_MASK];
      dest^[3] := limit^[ int(DESCALE(even_sum - odd_outer,
                                      CONST_BITS+PASS1_BITS+3+1))
                          and RANGE_MASK];
    end;

    Inc(int_ptr(ws), DCTSIZE);  { advance pointer to next row }
  end;
end;


{ Dequantize one block of DCT coefficients and apply a reduced-size inverse
  DCT to it, producing a 2x2 block of output samples.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block to transform.
  output_buf  array of sample rows; rows 0..1 are written.
  output_col  index within each row of the first of the 2 samples written.

  The work is done in two passes.  Pass 1 walks the input columns and
  stores two values per column into a local workspace; pass 2 walks the
  two workspace rows and stores two range-limited samples per row.  Only
  the DC term and the odd-numbered terms take part in either pass. }

{GLOBAL}
procedure jpeg_idct_2x2 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
                         coef_block : JCOEFPTR;
                         output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);
type
  TWorkspace = array[0..(DCTSIZE*2)-1] of int; { 2 rows of DCTSIZE values }
  PWorkspace = ^TWorkspace;
var
  workspace : TWorkspace;       { buffers data between passes }
  ws : PWorkspace;              { current column (pass 1) / row (pass 2) }
  coefs : JCOEFPTR;             { current input column }
  quant : ISLOW_MULT_TYPE_FIELD_PTR; { multipliers for the current column }
  dest : JSAMPROW;              { where the current output row is stored }
  limit : JSAMPROW;             { range-limit table, see below }
  cols_left : int;
  row : int;
  flat_col : int;               { value of an all-DC column }
  flat_sample : JSAMPLE;        { value of an all-DC row }
  even_dc : INT32;              { even part: the scaled DC term }
  odd_sum : INT32;              { odd part: weighted sum of terms 7,5,3,1 }
  in0, in1, in3, in5, in7 : INT32; { inputs, by coefficient index }
begin
  { Each IDCT routine is responsible for range-limiting its results and
    converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
    be quite far out of range if the input data is corrupt, so a bulletproof
    range-limiting step is required.  A mask-and-table-lookup method does
    both operations at once: every result is masked with RANGE_MASK and
    used as an index into the table, addressed from entry CENTERJSAMPLE. }

  limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));

  { Pass 1: process columns from input, store into work array. }

  coefs := coef_block;
  quant := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
  ws := @workspace;
  for cols_left := DCTSIZE downto 1 do
  begin
    { Columns 2, 4 and 6 are not processed at all; pass 2 never reads them. }
    if (cols_left <> DCTSIZE-2) and (cols_left <> DCTSIZE-4) and
       (cols_left <> DCTSIZE-6) then
    begin
      if (coefs^[DCTSIZE*1]<>0) or (coefs^[DCTSIZE*3]<>0) or
         (coefs^[DCTSIZE*5]<>0) or (coefs^[DCTSIZE*7]<>0) then
      begin
        { Even part }

        in0 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*0]) * quant^[DCTSIZE*0]);

        even_dc := in0 shl (CONST_BITS+2);

        { Odd part }

        in7 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*7]) * quant^[DCTSIZE*7]);
        in5 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*5]) * quant^[DCTSIZE*5]);
        in3 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*3]) * quant^[DCTSIZE*3]);
        in1 := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*1]) * quant^[DCTSIZE*1]);

        odd_sum := MULTIPLY(in7, - FIX_0_720959822) { sqrt(2) * (c7-c5+c3-c1) }
                 + MULTIPLY(in5, FIX_0_850430095)   { sqrt(2) * (-c1+c3+c5+c7) }
                 + MULTIPLY(in3, - FIX_1_272758580) { sqrt(2) * (-c1+c3-c5-c7) }
                 + MULTIPLY(in1, FIX_3_624509785);  { sqrt(2) * (c1+c3+c5+c7) }

        { Final output stage: one value for each of the 2 workspace rows }

        ws^[DCTSIZE*0] := int (DESCALE(even_dc + odd_sum,
                                       CONST_BITS-PASS1_BITS+2));
        ws^[DCTSIZE*1] := int (DESCALE(even_dc - odd_sum,
                                       CONST_BITS-PASS1_BITS+2));
      end
      else
      begin
        { AC terms all zero; terms 2,4,6 need not be examined for 2x2
          output.  The scaled DC value is stored in both rows. }
        flat_col := (ISLOW_MULT_TYPE(coefs^[DCTSIZE*0]) *
                        quant^[DCTSIZE*0]) shl PASS1_BITS;

        ws^[DCTSIZE*0] := flat_col;
        ws^[DCTSIZE*1] := flat_col;
      end;
    end;

    { advance all three pointers to the next column }
    Inc(JCOEF_PTR(coefs));
    Inc(ISLOW_MULT_TYPE_PTR(quant));
    Inc(int_ptr(ws));
  end;

  { Pass 2: process 2 rows from work array, store into output array. }

  ws := @workspace;
  for row := 0 to 1 do
  begin
    dest := JSAMPROW(@ output_buf^[row]^[output_col]);

    { It's not clear whether a zero row test is worthwhile here, so it can
      be compiled out by defining NO_ZERO_ROW_TEST. }

{$ifndef NO_ZERO_ROW_TEST}
    if (ws^[1]=0) and (ws^[3]=0) and (ws^[5]=0) and (ws^[7]= 0) then
    begin
      { AC terms all zero: both samples of the row are equal }
      flat_sample := limit^[ int(DESCALE(INT32(ws^[0]), PASS1_BITS+3))
                             and RANGE_MASK];

      dest^[0] := flat_sample;
      dest^[1] := flat_sample;
    end
    else
{$endif}
    begin
      { Even part }

      even_dc := (INT32 (ws^[0])) shl (CONST_BITS+2);

      { Odd part }

      odd_sum := MULTIPLY( INT32(ws^[7]), - FIX_0_720959822) { sqrt(2) * (c7-c5+c3-c1) }
               + MULTIPLY( INT32(ws^[5]), FIX_0_850430095)   { sqrt(2) * (-c1+c3+c5+c7) }
               + MULTIPLY( INT32(ws^[3]), - FIX_1_272758580) { sqrt(2) * (-c1+c3-c5-c7) }
               + MULTIPLY( INT32(ws^[1]), FIX_3_624509785);  { sqrt(2) * (c1+c3+c5+c7) }

      { Final output stage: descale, mask and range-limit each sample }

      dest^[0] := limit^[ int(DESCALE(even_dc + odd_sum,
                                      CONST_BITS+PASS1_BITS+3+2))
                          and RANGE_MASK];
      dest^[1] := limit^[ int(DESCALE(even_dc - odd_sum,
                                      CONST_BITS+PASS1_BITS+3+2))
                          and RANGE_MASK];
    end;

    Inc(int_ptr(ws), DCTSIZE);  { advance pointer to next row }
  end;
end;


{ Dequantize the DC coefficient of one block and produce a reduced-size
  1x1 output block from it.

  cinfo       decompressor state; its sample_range_limit table is read.
  compptr     component info; its dct_table is read as the multiplier table.
  coef_block  the coefficient block; only element 0 (DC) is read.
  output_buf  array of sample rows; row 0 is written.
  output_col  index within that row of the single sample written. }

{GLOBAL}
procedure jpeg_idct_1x1 (cinfo : j_decompress_ptr;
                         compptr : jpeg_component_info_ptr;
                         coef_block : JCOEFPTR;
                         output_buf : JSAMPARRAY;
                         output_col : JDIMENSION);
var
  qtable : ISLOW_MULT_TYPE_FIELD_PTR;   { multiplier table }
  limit : JSAMPROW;                     { range-limit table, see below }
  dc : int;                             { dequantized, then descaled, DC }
begin
  { Each IDCT routine is responsible for range-limiting its results and
    converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
    be quite far out of range if the input data is corrupt, so a bulletproof
    range-limiting step is required.  A mask-and-table-lookup method does
    both operations at once: the result is masked with RANGE_MASK and used
    as an index into the table, addressed from entry CENTERJSAMPLE. }

  qtable := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
  limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));

  { We hardly need an inverse DCT routine for this: just take the
    average pixel value, which is one-eighth of the DC coefficient.
    DESCALE by 3 bits performs that division with rounding. }

  dc := (ISLOW_MULT_TYPE(coef_block^[0]) * qtable^[0]);
  dc := int (DESCALE( INT32(dc), 3));

  output_buf^[0]^[output_col] := limit^[dc and RANGE_MASK];
end;

end.