/* This is an experimental modification of this file.
 * The MPEG-Player is used as a real environment testcontainer to examine the
 * performance of the DCT algorithms.
 * It turned out (in other words, it could be proved, at least on an Intel-
 * based machine) that a higher decoding speed (and ultimately an up to 20 %
 * higher frame rate) can be achieved when following the proposals of Arai,
 * Agui and Nakajima - and the extensions made by Feig.
 *
 * Please find a detailed description of the implementation in
 *    E. Feig, E. Linzer, Discrete Cosine Transform Algorithms for Image Data
 *           Compression, Proceedings Electronic Imaging '90 East, pp. 84-87,
 *           Boston, MA (Oct. 29th - Nov. 1st, 1990) or see
 *    W.B. Pennebaker, J.L. Mitchell, JPEG Still Image Compression Standard,
 *           Van Nostrand Reinhold 1993
 *
 * Have fun.
 * <rst@hrz.tu-chemnitz.de>
 */

/*
 * This file contains the basic inverse-DCT transformation subroutine.
 */


#ifdef HAVE_MMX

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>                        /* on BSD, uio.h needs types.h */
#include <sys/uio.h>
#include <X11/Xlib.h>
#include <X11/extensions/XShm.h>

#include "threads.h"
#include "config.h"
#include "common.h"
#include "mtime.h"
#include "plugins.h"

#include "intf_msg.h"
#include "debug.h"

#include "input.h"
#include "input_netlist.h"
#include "decoder_fifo.h"
#include "video.h"
#include "video_output.h"
#include "video_decoder.h"

#include "global.h"

/* global declarations */
void Initialize_Fast_IDCT (void);
void Fast_IDCT (DCTELEM *block);
extern void IDCT_mmx(DCTELEM *block);

/* private data */
static short iclip[1024]; /* clipping table */
static short *iclp;

/* two dimensional inverse discrete cosine transform */
/*
 * Two-dimensional inverse discrete cosine transform (MMX build).
 *
 * block: coefficient block handed to IDCT_mmx().  block[0] is biased in
 *        place before the call for every picture whose coding type is not
 *        I_TYPE.
 */
void Fast_IDCT(DCTELEM *block)
{
  /* Workaround for the "jumping chrominance" bug -- possibly related to the
   * comment at the top of mmxidct.S (?). */
  if (picture_coding_type != I_TYPE)
    block[0] += 4;

  IDCT_mmx(block);
}

/*
 * Builds the clipping table used by the MMX sparse IDCT.
 *
 * iclp is pointed at the middle of iclip[] so that it can be indexed with
 * values from -512 to 511; each entry holds its index clamped to the range
 * [-256, 255].
 */
void Initialize_Fast_IDCT()
{
  int v;

  iclp = &iclip[512];

  for (v = -512; v < 512; v++) {
    short clipped;

    if (v < -256)
      clipped = -256;
    else if (v > 255)
      clipped = 255;
    else
      clipped = v;

    iclp[v] = clipped;
  }
}

/*
 * Sparse inverse DCT for the MMX build.
 *
 * Derives a single value from data[0] through the iclp clipping table and
 * writes it to all 64 entries of the block.
 *
 * data: block of 64 coefficients, overwritten in place.
 * pos:  not used; note that pos is necessarily 0 in this IDCT.
 *
 * Initialize_Fast_IDCT() must have been called before, since it is the only
 * place where iclp is set.
 * NOTE(review): assumes data[0] is small enough for the computed index to
 * stay inside the 1024-entry table -- TODO confirm against the callers.
 */
void Fast_Sparse_IDCT(DCTELEM *data, int pos)
{
    int val;
    int i;

    (void)pos;

    /* Multiply instead of "<< 3": left-shifting a negative value is
     * undefined behaviour in C, and data[0] may be negative. */
    val = iclp[((data[0] * 8) + 32) >> 6];

    /* data[0] has been consumed above, so the whole block can be filled. */
    for (i = 0; i < 64; i++)
        data[i] = val;
}


/*
 * Debugging helper: prints the 64 entries of a block on stdout as 8 rows of
 * 8 tab-terminated values, preceded by an empty line.
 */
void dct_dump(m1)
DCTELEM* m1;
{
    int idx;

    printf("\n");
    for (idx = 0; idx < 64; idx++) {
        printf("%d\t", (m1[idx]));
        /* End the row after every eighth value. */
        if ((idx & 7) == 7)
            printf("\n");
    }
}

/* Empty in the MMX build: the matrix passed in is left untouched. */
void norm_quant_matrix(int * meuh)
{
    (void)meuh;
}

#else

#include <math.h>
#include <string.h>
#include <stdio.h>

#include "config.h"
#include "idct_constants.h"

/* Block geometry and element types used by the C implementation below. */
#define DCTSIZE 8
#define DCTELEM int
#define DCTBLOCK int*

/* Fixed-point bit counts; ALLBITS equals CONST_BITS_JA + VAL_BITS. */
#define CONST_BITS_JA 11
#define VAL_BITS 11
#define ALLBITS 22

/* Shift count used by Fast_IDCT.  Parenthesised so that the macro expands
 * to a single operand in any expression context (e.g. 2 * TWO). */
#define TWO (1+CONST_BITS_JA)

/* Integer multipliers used by Fast_IDCT.
 * NOTE(review): presumably trigonometric factors scaled by
 * 2^CONST_BITS_JA -- TODO confirm against the referenced papers. */
#define C6                       1567
#define C4C6                     2217
#define C4                       2896
#define Q                        2217
#define C4Q                      3135
#define R                        5352
#define C4R                      7568
#define PI M_PI


/* We assume that right shift corresponds to signed division by 2 with
 * rounding towards minus infinity.  This is correct for typical "arithmetic
 * shift" instructions that shift in copies of the sign bit.  But some
 * C compilers implement >> with an unsigned shift.
 */

/*
 * This routine is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
Sorry, this code only copes with 8 x8 DCTs.	/* deliberate syntax err */
#endif

/* Scratch arrays holding the intermediate results of Fast_IDCT().
 * NOTE(review): they live at file scope with external linkage, so two
 * concurrent calls to Fast_IDCT() would share them. */

int matr1[64];
int matr2[64];

/* Picture coding type, defined elsewhere; read by Fast_IDCT(). */

extern int picture_coding_type;

/*
 *--------------------------------------------------------------
 *
 * Initialize_Fast_IDCT --
 *
 *  Initialization hook of the IDCT. Nothing is computed here: the
 *  singleton coefficient IDCT values (PreIDCT) are already precomputed.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.
 *
 *--------------------------------------------------------------
 */
void Initialize_Fast_IDCT()
{
    /* Intentionally empty: PreIDCT is already precomputed, so this build
     * has nothing to set up. */
    return;
}

/*
 *--------------------------------------------------------------
 *
 * Fast_Sparse_IDCT --
 *
 *  Performs the inverse DCT on a block in which only the coefficient at
 *  index pos is taken into account, using the PreIDCT lookup table.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.
 *
 *--------------------------------------------------------------
 */

/*
 * Inverse transform of a block from a single coefficient.
 *
 * data: block of 64 values; only data[pos] is read, and all 64 entries are
 *       overwritten in place with the result.
 * pos:  index of the coefficient to expand.  0 selects the DC shortcut; any
 *       other value is used as an index into both data[] and PreIDCT[].
 */
void Fast_Sparse_IDCT(DCTBLOCK data, int pos)
{
        int val, co;
        int p;
        int *tmpptr;

        /* DC value: every output entry gets the same scaled-down DC term. */
        if (pos == 0) {
                val = (data[0] >> VAL_BITS);
                for (p = 0; p < 64; p++) {
                        data[p] = val;
                }
                return;
        }

        /* AC values: */

        /* Save the coefficient first: the loop below overwrites data[pos]. */
        co = data[pos];

        /* perform the IDFT using the lookup table */
        tmpptr = PreIDCT[pos];

        for (p = 0; p < 64; p++) {
                data[p] = (tmpptr[p] * co) >> (VAL_BITS-2);
        }
}

/*
 *--------------------------------------------------------------
 *
 * Fast_IDCT --
 *
 *  The inverse DCT function. No tricks, no major optimizations.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.
 *
 *--------------------------------------------------------------
 */

/*
 * Inverse DCT of one 8x8 block, computed in place.
 *
 * coeff: block of 64 values; it is read as input and overwritten with the
 *        64 results, each shifted right by ALLBITS.
 *
 * The function works through the file-scope scratch arrays matr1 and matr2:
 *   1. a row-wise pass combines the 8 inputs of each row into matr1,
 *   2. eight "line n" sections read column n of matr1 and fill row n of
 *      matr2, applying the integer multipliers (C4, C6, Q, R, ...),
 *   3. rows 4 and 6 of matr2 are completed afterwards,
 *   4. a row-wise pass of additions goes from matr2 back to matr1,
 *   5. a column-wise pass of additions goes from matr1 to coeff.
 *
 * It also reads the external picture_coding_type.
 */
void Fast_IDCT(DCTBLOCK coeff)
{
    /* NOTE(review): tmp0, tmp5, n0, m0..m8 and j are declared but never
     * used in this function. */
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int co1, co2, co3, co5, co6, co7, co35, co17;
    int n0, n1, n2, n3;
    int m0,m1, m2, m3, m4, m5, m6, m7, m8;
    int l0 = 0, l1 = 0, l2 = 0, l3 = 0;
    int g0, g1, g2, g3;
    int i, j, p;
    int tmp;

    short k;
    register int xxx;

    /* Yet another incarnation of the "jumping chrominance" bug (the MMX
     * iDCT has the same one). Apparently a rounding issue.
     * NOTE(review): 1 presumably stands for I_TYPE, which the MMX variant
     * tests against -- TODO confirm. */
    if (picture_coding_type != 1)
        coeff[0] += 1024;

    /* compute B1 (horizontal / vertical algorithm): */
    /* (the vertical part is in tensor product) */

    /* One iteration per row of 8 inputs: sums and differences of the row's
     * coefficients are stored in the same row of matr1. */
    for (k = 0; k < 64; k += 8) {
	matr1[k] = coeff[k + 0];
	matr1[k + 1] = coeff[k + 4];

	matr1[k + 2] = matr1[k + 3] = coeff[k + 2];
	matr1[k + 2] -= coeff[k + 6];
	matr1[k + 3] += coeff[k + 6];

	/* xxx accumulates coeff[k + 5] + coeff[k + 3] for use below. */
	matr1[k + 4] = xxx = coeff[k + 5];
	matr1[k + 4] -= coeff[k + 3];
	xxx += coeff[k + 3];

	matr1[k + 5] = matr1[k + 6] = coeff[k + 1];
	matr1[k + 7] = (matr1[k + 5] += coeff[k + 7]);
	matr1[k + 6] -= coeff[k + 7];

	matr1[k + 7] += xxx;
	matr1[k + 5] -= xxx;
    }

    /* p is the running write index into matr2 for the sections below. */
    p = 0;

    /* Each "line n" section reads column n of matr1 (matr1[n], matr1[8 + n],
     * ..., matr1[56 + n]) and writes row n of matr2. */

    /* line 0,  M x M */
    tmp4 = (co3 = matr1[24]) - (co5 = matr1[40]);
    tmp6 = (co1 = matr1[8]) - (co7 = matr1[56]);
    tmp = C6 * (tmp6 - tmp4);
    matr2[p++] = matr1[0] << CONST_BITS_JA;
    matr2[p++] = matr1[32] << CONST_BITS_JA;
    matr2[p++] = ((co2 = matr1[16]) - (co6 = matr1[48])) * C4;
    matr2[p++] = (co2 + co6) << CONST_BITS_JA;
    matr2[p++] = Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4;
    matr2[p++] = R * tmp6 - tmp;
    matr2[p++] = (co17 + co35) << CONST_BITS_JA;

    /* line 1,  M x M */
    tmp4 = (co3 = matr1[25]) - (co5 = matr1[41]);
    tmp6 = (co1 = matr1[9]) - (co7 = matr1[57]);
    tmp = C6 * (tmp6 - tmp4);
    matr2[p++] = matr1[1] << CONST_BITS_JA;
    matr2[p++] = matr1[33] << CONST_BITS_JA;
    matr2[p++] = ((co2 = matr1[17]) - (co6 = matr1[49])) * C4;
    matr2[p++] = (co2 + co6) << CONST_BITS_JA;
    matr2[p++] = Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4;
    matr2[p++] = R * tmp6 - tmp;
    matr2[p++] = (co17 + co35) << CONST_BITS_JA;

    /* line 2,  M x M */
    tmp4 = (co3 = matr1[26]) - (co5 = matr1[42]);
    tmp6 = (co1 = matr1[10]) - (co7 = matr1[58]);
    tmp = Q * (tmp6 - tmp4);
    matr2[p++] = C4 * matr1[2];
    matr2[p++] = C4 * matr1[34];
    matr2[p++] = ((co2 = matr1[18]) - (co6 = matr1[50])) << TWO;
    matr2[p++] = C4 * (co2 + co6);
    matr2[p++] = C4Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) << TWO;
    matr2[p++] = C4R * tmp6 - tmp;
    matr2[p++] = C4 * (co17 + co35);

    /* line 3,  M x M */
    tmp4 = (co3 = matr1[27]) - (co5 = matr1[43]);
    tmp6 = (co1 = matr1[11]) - (co7 = matr1[59]);
    tmp = C6 * (tmp6 - tmp4);
    matr2[p++] = matr1[3] << CONST_BITS_JA;
    matr2[p++] = matr1[35] << CONST_BITS_JA;
    matr2[p++] = ((co2 = matr1[19]) - (co6 = matr1[51])) * C4;
    matr2[p++] = (co2 + co6) << CONST_BITS_JA;
    matr2[p++] = Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4;
    matr2[p++] = R * tmp6 - tmp;
    matr2[p++] = (co17 + co35) << CONST_BITS_JA;

    /* line 4,  M x M */
    /* Entries matr2[36] and matr2[38] are skipped here (p += 2); they are
     * filled in after line 7 from l0..l3, which start accumulating here. */
    matr2[p++] = matr1[4];
    matr2[p++] = matr1[36];
    matr2[p++] = (co2 = matr1[20]) - (co6 = matr1[52]);
    matr2[p] = co2 + co6;
    l0 = l2 = -(co3 = matr1[28]) + (co5 = matr1[44]);
    p += 2;
    matr2[p] = (co17 = (co1 = matr1[12]) + (co7 = matr1[60])) - (co35 = co3 + co5);
    l3 = -(l1 = co1 - co7);
    p += 2;
    matr2[p++] = co17 + co35;

    /* line 5,  M x M */
    tmp4 = (co3 = matr1[29]) - (co5 = matr1[45]);
    tmp6 = (co1 = matr1[13]) - (co7 = matr1[61]);
    tmp = Q * (tmp6 - tmp4);
    matr2[p++] = C4 * matr1[5];
    matr2[p++] = C4 * matr1[37];
    matr2[p++] = ((co2 = matr1[16 + 5]) - (co6 = matr1[48 + 5])) << TWO;
    matr2[p++] = C4 * (co2 + co6);
    matr2[p++] = C4Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) << TWO;
    matr2[p++] = C4R * tmp6 - tmp;
    matr2[p++] = C4 * (co17 + co35);

    /* line 6,  M x M */
    /* Entries matr2[52] and matr2[54] are skipped here (p += 2) and filled
     * in after line 7; l0..l3 receive their second contribution. */
    matr2[p++] = matr1[6];
    matr2[p++] = matr1[38];
    matr2[p++] = (co2 = matr1[22]) - (co6 = matr1[54]);
    matr2[p] = co2 + co6;
    l1 += (tmp4 = -(co3 = matr1[30]) + (co5 = matr1[46]));
    l3 += tmp4;
    p += 2;
    matr2[p] = (co17 = (co1 = matr1[14]) + (co7 = matr1[62])) - (co35 = co3 + co5);
    l2 += (tmp6 = co1 - co7);
    l0 -= tmp6;
    p += 2;
    matr2[p++] = co17 + co35;

    /* line 7,  M x M */
    tmp4 = (co3 = matr1[24 + 7]) - (co5 = matr1[40 + 7]);
    tmp6 = (co1 = matr1[8 + 7]) - (co7 = matr1[56 + 7]);
    tmp = C6 * (tmp6 - tmp4);
    matr2[p++] = matr1[7] << CONST_BITS_JA;
    matr2[p++] = matr1[32 + 7] << CONST_BITS_JA;
    matr2[p++] = ((co2 = matr1[16 + 7]) - (co6 = matr1[48 + 7])) * C4;
    matr2[p++] = (co2 + co6) << CONST_BITS_JA;
    matr2[p++] = Q * tmp4 - tmp;
    matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4;
    matr2[p++] = R * tmp6 - tmp;
    matr2[p++] = (co17 + co35) << CONST_BITS_JA;


    /* completing line 4 and 6,  O = J(NxM) */
    g0 = C4 * (l0 + l1);
    g1 = C4 * (l0 - l1);
    g2 = l2 << TWO;
    g3 = l3 << TWO;

    /* The four entries that were skipped in lines 4 and 6. */
    matr2[36] = g0 + g2;
    matr2[38] = g1 + g3;
    matr2[52] = g1 - g3;
    matr2[54] = g2 - g0;

    /* The remaining entries of rows 4 and 6 (offsets 0, 1, 2, 3, 5 and 7)
     * are combined pairwise: matr2[32 + n] with matr2[48 + n]. */
    tmp = C6 * (matr2[32] + matr2[48]);
    matr2[32] = -Q * matr2[32] - tmp;
    matr2[48] = R * matr2[48] - tmp;

    tmp = C6 * (matr2[33] + matr2[49]);
    matr2[33] = -Q * matr2[33] - tmp;
    matr2[49] = R * matr2[49] - tmp;

    tmp = Q * (matr2[34] + matr2[50]);
    matr2[34] = -C4Q * matr2[34] - tmp;
    matr2[50] = C4R * matr2[50] - tmp;

    tmp = C6 * (matr2[35] + matr2[51]);
    matr2[35] = -Q * matr2[35] - tmp;
    matr2[51] = R * matr2[51] - tmp;

    tmp = Q * (matr2[37] + matr2[53]);
    matr2[37] = -C4Q * matr2[37] - tmp;
    matr2[53] = C4R * matr2[53] - tmp;

    tmp = C6 * (matr2[39] + matr2[55]);
    matr2[39] = -Q * matr2[39] - tmp;
    matr2[55] = R * matr2[55] - tmp;

    /* Row-wise pass of additions and subtractions: each row of matr2 is
     * turned into the same row of matr1.  The temporaries are assigned
     * inside the expressions, so the statement order matters. */
    for (p = 0; p < 64; p += 8) {

	matr1[p] = (tmp4 = (n3 = matr2[p] + matr2[p + 1]) + matr2[p + 3]) + matr2[p + 7];
	matr1[p + 3] = (tmp6 = n3 - matr2[p + 3]) - (tmp7 = matr2[p + 4] -
	(tmp1 = (tmp2 = matr2[p + 6] - matr2[p + 7]) - matr2[p + 5]));
	matr1[p + 4] = tmp6 + tmp7;
	matr1[p + 1] = (tmp3 = (n1 = matr2[p] - matr2[p + 1]) + (n2 = matr2[p + 2] - matr2[p + 3])) + tmp2;
	matr1[p + 5] = (n1 - n2) + tmp1;	// no tmp because of the caching of (n1 -n2)
	matr1[p + 2] = (n1 - n2) - tmp1;
	matr1[p + 6] = tmp3 - tmp2;
	matr1[p + 7] = tmp4 - matr2[p + 7];
    }


    /* Column-wise pass: column i of matr1 produces row p of the output,
     * which is scaled down by ALLBITS bits and written back to coeff. */
    for (p = i = 0; p < 64; p += 8, i++) {

	coeff[p] = ((tmp4 = (n3 = matr1[i] + matr1[8 + i]) + matr1[24 + i]) + matr1[56 + i]) >> ALLBITS;
	coeff[p + 3] = ((tmp6 = n3 - matr1[24 + i]) - (tmp7 = matr1[32 + i] -
	       (tmp1 = (tmp2 = matr1[48 + i] - matr1[56 + i]) - matr1[40 + i]))) >> ALLBITS;
	coeff[p + 4] = (tmp6 + tmp7) >> ALLBITS;
	coeff[p + 1] = ((tmp3 = (n1 = matr1[i] - matr1[8 + i]) +
	    (n2 = matr1[16 + i] - matr1[24 + i])) + tmp2) >> ALLBITS;
	coeff[p + 5] = ((n1 - n2) + tmp1) >> ALLBITS;
	coeff[p + 2] = ((n1 - n2) - tmp1) >> ALLBITS;
	coeff[p + 6] = (tmp3 - tmp2) >> ALLBITS;
	coeff[p + 7] = (tmp4 - matr1[56 + i]) >> ALLBITS;
    }
}


/*
 *--------------------------------------------------------------
 *
 * norm_quant_matrix()
 *
 *  Remember that we are actually doing an inverse Discrete Fourier Transformation here.
 *  To get the DCT values right after the quantization step, we have to work our
 *  conversion constants into the quantization tables, since quantization means
 *  multiplying the DCT result by empirically defined quantization factors.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.
 *
 *--------------------------------------------------------------
 */

/*
 * Multiplies each of the 64 entries of m1, in place, by the entry of the
 * PreNorm table at the same index.
 */
void norm_quant_matrix(m1)
int m1[64];
{
    int idx = 0;

    while (idx < 64) {
        m1[idx] *= PreNorm[idx];
        idx++;
    }
}

/*
 *--------------------------------------------------------------
 *
 * dct_dump()
 *
 *  Prints out an entire DCTBLOCK (64 Elements in 8 x 8 Order).
 *  For debugging purpose only.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.
 *
 *--------------------------------------------------------------
 */

/*
 * Debugging helper: prints the 64 entries of m1 on stdout as 8 rows of
 * 8 tab-terminated values, preceded by an empty line.
 */
void dct_dump(m1)
int* m1;
{
    int row, col;
    int *cell = m1;

    printf("\n");
    for (row = 0; row < 8; row++) {
        /* Walk the block in storage order, one row of 8 per output line. */
        for (col = 0; col < 8; col++)
            printf("%d\t", (*cell++));
        printf("\n");
    }
}

#endif
