/* colour.c (C) nejik 2003 */

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/*
 * References:
 * gimp-1.2.3        (http://www.gimp.org)
 * xvidcore-0.9.1    (http://xvid.org)
 *
 */


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <inttypes.h>

#include "util.h"
#include "csp.h"

/* Frame geometry and colour space of the stream; written by colour_init(). */
static int Width;
static int Height;
static int csp = CSP_UNKNOWN;

/* Offset added to the 0..255 HLS hue; colour_init() stores hue * 255/359. */
static int Hue = 0;
//static int Saturation = 100;
//static int Lightness = 100;
/* Offset added to every luma sample after the contrast gain. */
static int Bright = 0;
/*
 * Luma gain (Contrast) and chroma gain (Color).  colour_init() stores them
 * scaled so that 256 means "unchanged"; the transform kernels multiply and
 * then shift right by 8.
 * NOTE(review): the initial value 100 is not on that scale.  colour()
 * refuses to run while csp is CSP_UNKNOWN, so it looks like these initial
 * values are never used before colour_init() overwrites them -- confirm.
 */
static int Contrast = 100;
static int Color = 100;

/* Brightness/contrast/colour kernel chosen by colour_init(). */
static void (*yv12_trans)(uint8_t *dst, uint8_t *src);


#ifdef ARCH_X86

#include "attributes.h"
#include "mmx.h"
#include "mm_accel.h"

/*
 * Packed constants loaded into MMX registers by yv12_trans_mmxext().
 * colour_init() fills them in: the three gains/offsets are replicated into
 * the four .w lanes, the limits and the 128 bias into the eight .ub lanes.
 */
static mmx_t mbright;	/* Bright in every word lane */
static mmx_t mcontrast;	/* Contrast in every word lane */
static mmx_t mcolor;	/* Color in every word lane */
static mmx_t mhilim;	/* 235 in every byte lane */
static mmx_t mlolim;	/* 16 in every byte lane */
static mmx_t mb128;	/* 128 in every byte lane */

#endif /* ARCH_X86 */

/*
 * Convert an 8-bit RGB triple into hue / lightness / saturation, each
 * expressed on a 0..255 scale and truncated to int on output.
 *
 * R, G, B: input components.
 * H, L, S: receive hue, lightness and saturation.
 *
 * Grey input (all components equal) yields hue 0 and saturation 0.
 */
static void
rgb_to_hls(int R, int G, int B, int *H, int *L, int *S)
{
	int hi = (R > G) ? R : G;
	int lo = (R > G) ? G : R;
	int span;
	double hue = 0.0;
	double sat = 0.0;
	double light;

	/* fold the third component into the running extremes */
	if (B > hi)
		hi = B;
	else if (B < lo)
		lo = B;

	/* integer division: lightness is already whole before it is compared */
	light = (hi + lo) / 2;

	if (hi != lo) {
		span = hi - lo;

		if (light < 128)
			sat = 255 * (double)span / (double)(hi + lo);
		else
			sat = 255 * (double)span / (double)(511 - hi - lo);

		/* sextant position, measured from the dominant component */
		if (R == hi)
			hue = (G - B) / (double)span;
		else if (G == hi)
			hue = 2 + (B - R) / (double)span;
		else
			hue = 4 + (R - G) / (double)span;

		/* six sextants of 42.5 units make up the 0..255 circle */
		hue = hue * 42.5;
		if (hue < 0)
			hue += 255;
		else if (hue > 255)
			hue -= 255;
	}

	*H = hue;
	*L = light;
	*S = sat;
}

/*
 * Evaluate one RGB channel of the HLS colour model.
 *
 * m1, m2: lower and upper channel levels (fractions, as computed by
 *         hls_to_rgb()).
 * h:      hue for this channel on the 0..255 circle; a value outside that
 *         range is pulled back by a single step of 255.
 *
 * Returns the channel level multiplied by 255 and truncated to int; the
 * result is not clamped here.
 */
static int
hls_value(double m1, double m2, double h)
{
	double level;

	h = (h > 255) ? h - 255 : (h < 0) ? h + 255 : h;

	level = (h < 42.5)  ? m1 + (m2-m1) * (h / 42.5)		/* rising edge */
	      : (h < 127.5) ? m2				/* plateau */
	      : (h < 170)   ? m1 + (m2 - m1) * ((170 - h) / 42.5) /* falling edge */
	      :               m1;				/* floor */

	return (int)(level * 255);
}

/*
 * Convert hue / lightness / saturation (each on a 0..255 scale) back to RGB.
 *
 * H, L, S: input components.
 * R, G, B: receive the result.
 *
 * With zero saturation all three outputs are set to L unchanged; otherwise
 * each channel comes from hls_value() and is clamped to 0..255.
 */
static void
hls_to_rgb(int H, int L, int S, int *R, int *G, int *B)
{
	double hue = H;
	double light = L;
	double sat = S;
	double upper, lower;
	int red, green, blue;

	/* achromatic: no hue to evaluate */
	if (S == 0) {
		*R = L;
		*G = L;
		*B = L;
		return;
	}

	if (L < 128)
		upper = (light * (255 + sat)) / 65025.0;
	else
		upper = (light + sat - (light * sat) / 255.0) / 255.0;
	lower = (light / 127.5) - upper;

	/* the three channels sit a third of the hue circle (85) apart */
	red = hls_value(lower, upper, hue + 85);
	green = hls_value(lower, upper, hue);
	blue = hls_value(lower, upper, hue - 85);

	if (red > 255)
		red = 255;
	else if (red < 0)
		red = 0;

	if (green > 255)
		green = 255;
	else if (green < 0)
		green = 0;

	if (blue > 255)
		blue = 255;
	else if (blue < 0)
		blue = 0;

	*R = red;
	*G = green;
	*B = blue;
}

/* RGB -> YUV weights; each row lists the R, G, B factors and the offset. */
#define Y_R_IN      0.257
#define Y_G_IN      0.504
#define Y_B_IN      0.098
#define Y_ADD_IN    16

#define U_R_IN      0.148
#define U_G_IN      0.291
#define U_B_IN      0.439
#define U_ADD_IN    128

#define V_R_IN      0.439
#define V_G_IN      0.368
#define V_B_IN      0.071
#define V_ADD_IN    128

/* Weights are applied as integers scaled by 2^SCALEBITS_IN, rounded to nearest. */
#define SCALEBITS_IN  8
#define FIX_IN(x)   ((uint16_t) ((x) * (1L<<SCALEBITS_IN) + 0.5))

/*
 * Convert one RGB pixel to YUV with the fixed-point weights above.
 *
 * R, G, B: input components.
 * Y, U, V: receive the result.  No clamping is applied.
 *
 * The weighted sums for U and V can be negative before the shift, so the
 * result relies on the compiler's behaviour for right-shifting negative ints.
 */
static void
rgb_to_yuv(int R, int G, int B, int *Y, int *U, int *V)
{
	const int luma_sum = FIX_IN(Y_R_IN) * R + FIX_IN(Y_G_IN) * G + FIX_IN(Y_B_IN) * B;
	const int cb_sum = -FIX_IN(U_R_IN) * R - FIX_IN(U_G_IN) * G + FIX_IN(U_B_IN) * B;
	const int cr_sum = FIX_IN(V_R_IN) * R - FIX_IN(V_G_IN) * G - FIX_IN(V_B_IN) * B;

	*Y = (luma_sum >> SCALEBITS_IN) + Y_ADD_IN;
	*U = (cb_sum >> SCALEBITS_IN) + U_ADD_IN;
	*V = (cr_sum >> SCALEBITS_IN) + V_ADD_IN;
}

/* YUV -> RGB weights and the offsets removed from Y, U and V beforehand. */
#define RGB_Y_OUT   1.164
#define B_U_OUT     2.018
#define Y_ADD_OUT   16

#define G_U_OUT     0.391
#define G_V_OUT     0.813
#define U_ADD_OUT   128

#define R_V_OUT     1.596
#define V_ADD_OUT   128

/* Weights are applied as integers scaled by 2^SCALEBITS_OUT, rounded to nearest. */
#define SCALEBITS_OUT 13
#define FIX_OUT(x)    ((uint16_t) ((x) * (1L<<SCALEBITS_OUT) + 0.5))

/*
 * Per-sample products (weight * (sample - offset)), indexed by the 8-bit
 * sample value.  colour_init() fills these in.
 */
int32_t RGB_Y_tab[256];
int32_t B_U_tab[256];
int32_t G_U_tab[256];
int32_t G_V_tab[256];
int32_t R_V_tab[256];

/*
 * Convert one YUV pixel to RGB through the lookup tables above.
 *
 * Y, U, V: input samples; they index the 256-entry tables directly, so
 *          they must lie in 0..255.
 * R, G, B: receive the result, each clamped to 0..255.
 */
static void
yuv_to_rgb(int Y, int U, int V, int *R, int *G, int *B)
{
	const int luma = RGB_Y_tab[Y];
	const int green_chroma = G_U_tab[U] + G_V_tab[V];
	int red = (luma + R_V_tab[V]) >> SCALEBITS_OUT;
	int green = (luma - green_chroma) >> SCALEBITS_OUT;
	int blue = (luma + B_U_tab[U]) >> SCALEBITS_OUT;

	if (red > 255)
		red = 255;
	else if (red < 0)
		red = 0;

	if (green > 255)
		green = 255;
	else if (green < 0)
		green = 0;

	if (blue > 255)
		blue = 255;
	else if (blue < 0)
		blue = 0;

	*R = red;
	*G = green;
	*B = blue;
}

/*
 * Apply the configured hue offset to one HLS pixel, in place.
 *
 * hu:   hue; the file-level Hue offset is added and the sum is wrapped
 *       into 0..255.
 * ligh: lightness; passed through unchanged.
 * satu: saturation; passed through unchanged.
 *
 * The saturation/lightness scaling is compiled out (#if 0) to match the
 * disabled Saturation/Lightness settings at file scope.
 */
static void
hls_trans(int *hu, int *ligh, int *satu)
{
	int h = *hu;
	int s = *satu;
	int l = *ligh;

	h += Hue;
	/*
	 * True modulo rather than a single +/-256 correction, so an offset of
	 * more than one full turn still lands inside 0..255.  C's % keeps the
	 * sign of the dividend, hence the follow-up adjustment.
	 */
	h %= 256;
	if (h < 0)
		h += 256;

#if 0
	s *= Saturation;
	s /= 100;
	s = (s < 0) ? 0 : (s > 255) ? 255 : s;

	l = l * Lightness;
	l /= 100;
	l = (l < 0) ? 0 : (l > 255) ? 255 : l;
#endif

	*hu = h;
	*satu = s;
	*ligh = l;
}

/*
 * Apply the hue adjustment to a single YUV pixel, in place.
 *
 * The pixel takes a round trip YUV -> RGB -> HLS, is modified by
 * hls_trans(), and is then converted back HLS -> RGB -> YUV.
 *
 * y, u, v: pixel components; read on entry and overwritten with the result.
 */
static void
yuv_hue_trans(int *y, int *u, int *v)
{
	int red, green, blue;
	int hue, light, sat;

	/* forward: into the space where hue is a single coordinate */
	yuv_to_rgb(*y, *u, *v, &red, &green, &blue);
	rgb_to_hls(red, green, blue, &hue, &light, &sat);

	hls_trans(&hue, &light, &sat);

	/* and back again */
	hls_to_rgb(hue, light, sat, &red, &green, &blue);
	rgb_to_yuv(red, green, blue, y, u, v);
}

static void
hue_trans_yv12(uint8_t *dst, uint8_t *src)
{
	int W = Width;
	int H = Height;
	int x, y;
	int stride_y = Width * 2;
	int stride_uv = Width/2;
	uint8_t *dy, *du, *dv, *sy, *su, *sv;
	int Y, U, V, u4, v4, tu, tv;

	dy = dst;
	dv = dst + W * H;
	du = dst + (W * H / 4 * 5);
	sy = src;
	if (csp == CSP_YV12) {
	  sv = src + W * H;
	  su = src + (W * H / 4 * 5);
	} else {
	  su = src + W * H;
	  sv = src + (W * H / 4 * 5);
	}
	H /= 2;
	for (y = 0; y < H; y++) {
		for (x = 0; x < W; x+=2) {
			u4 = v4 = 0;
			U = su[x/2];
			V = sv[x/2];

			tu = U;
			tv = V;
			Y = sy[x];
			yuv_hue_trans(&Y, &tu, &tv);
			dy[x] = Y;
			u4 += tu;
			v4 += tv;

			tu = U;
			tv = V;
			Y = sy[x+1];
			yuv_hue_trans(&Y, &tu, &tv);
			dy[x+1] = Y;
			u4 += tu;
			v4 += tv;

			tu = U;
			tv = V;
			Y = sy[x+W];
			yuv_hue_trans(&Y, &tu, &tv);
			dy[x+W] = Y;
			u4 += tu;
			v4 += tv;

			tu = U;
			tv = V;
			Y = sy[x+W+1];
			yuv_hue_trans(&Y, &tu, &tv);
			dy[x+W+1] = Y;
			u4 += tu;
			v4 += tv;

			du[x/2] = u4 / 4;
			dv[x/2] = v4 / 4;
		}
		dy += stride_y;
		dv += stride_uv;
		du += stride_uv;
		sy += stride_y;
		sv += stride_uv;
		su += stride_uv;
	}
}

/*
 * Portable brightness / contrast / colour-gain kernel for planar 4:2:0.
 *
 * dst: output frame.
 * src: input frame; may be the same buffer as dst for in-place use.
 *
 * Luma:   out = (((in - 16) * Contrast) >> 8) + 16 + Bright, clamped to
 *         Y_LO_LIMIT..Y_HI_LIMIT.
 * Chroma: out = (((in - 128) * Color) >> 8) + 128, clamped to
 *         C_LO_LIMIT..C_HI_LIMIT.  Both chroma planes get the same
 *         treatment, so the plane order (csp) does not matter here.
 *
 * A plane whose adjustment is neutral is copied through untouched, unless
 * dst and src are the same buffer, in which case there is nothing to copy.
 */
static void
yv12_trans_c(uint8_t *dst, uint8_t *src)
{
  int W = Width;
  int H = Height;
  int x, y, uv, v;
  int bright = Bright;
  int contrast = Contrast;
  int color = Color;
  uint8_t *dp, *sp;

  dp = dst;
  sp = src;
  if (bright != 0 || contrast != 256) {
    for (y = 0; y < H; y++) {
      for (x = 0; x < W; x++) {
        v = sp[x];
        v -= 16;
        v *= contrast;
        v >>= 8;
        v += 16;
        v += bright;
        v = (v>Y_HI_LIMIT) ? Y_HI_LIMIT : (v<Y_LO_LIMIT) ? Y_LO_LIMIT: v;
        dp[x] = v;
      }
      dp += W;
      sp += W;
    }
  } else if (dst != src) {
    /*
     * Test the pointers themselves rather than Hue: copying a buffer onto
     * itself with memcpy() is undefined, and an in-place call can happen
     * whether or not a hue pass ran first.
     */
    memmove(dp, sp, W*H);
  }

  if (color != 256) {
    int v_offset = W*H;
    int u_offset = v_offset + (v_offset/4);
    W /= 2;
    H /= 2;
    dp = dst + v_offset;
    sp = src + v_offset;
    /* first pass: plane right after luma; second pass: the plane after that */
    for (uv = 2; uv > 0; uv--) {
      for (y = 0; y < H; y++) {
        for (x = 0; x < W; x++) {
          v = sp[x];
          v -= 128;
          v *= color;
          v >>= 8;
          v += 128;
          v = (v>C_HI_LIMIT) ? C_HI_LIMIT : (v<C_LO_LIMIT) ? C_LO_LIMIT: v;
          dp[x] = v;
        }
        dp += W;
        sp += W;
      }
      dp = dst + u_offset;
      sp = src + u_offset;
    }
  } else if (dst != src) {
    /* W and H are still the full luma size here: both chroma planes at once */
    memmove(dst+(W*H), src+(W*H), (W*H)/2);
  }
}

#ifdef ARCH_X86
/*
 * MMX-ext kernel for brightness / contrast / colour gain on planar 4:2:0.
 *
 * dst: output frame.
 * src: input frame.
 *
 * Uses the packed constants mhilim, mlolim, mcontrast, mbright, mcolor and
 * mb128 that colour_init() prepares.  The comments below assume the mmx.h
 * macros map one-to-one onto the like-named x86 instructions.
 *
 * NOTE(review): both inner loops advance eight bytes per iteration, so the
 * code appears to assume W (luma) and W/2 (chroma) are multiples of 8;
 * otherwise the last access of a row presumably runs past the row end, and
 * past the plane on the final row -- confirm what callers guarantee.
 * NOTE(review): chroma is limited with the same mlolim/mhilim pair as luma,
 * whereas yv12_trans_c() uses C_LO_LIMIT/C_HI_LIMIT -- confirm whether the
 * two kernels are meant to agree.
 */
static void
yv12_trans_mmxext(uint8_t *dst, uint8_t *src)
{
	int W = Width;
	int H = Height;
	int x, y, uv;
	uint8_t *dp, *sp, *s, *d;

	dp = dst;
	sp = src;
	/* ---- luma plane ---- */
	if (Bright != 0 || Contrast != 256) {
		/* loop-invariant constants: mm7 = high limit, mm6 = low limit,
		 * mm5 = contrast gain, mm4 = brightness offset */
		movq_m2r(mhilim, mm7);
		movq_m2r(mlolim, mm6);
		movq_m2r(mcontrast, mm5);
		movq_m2r(mbright, mm4);
		/* mm3 is cleared here but not referenced again in this function */
		pxor_r2r(mm3, mm3);
		for (y = 0; y < H; y++) {
			s = sp;
			d = dp;
			for (x = 0; x < W; x+=8) {
				// contrast
				movq_m2r(*s, mm2);
				s += 8;

				/* remove the low-limit offset, saturating at zero */
				psubusb_r2r(mm6, mm2);
				/* interleave with zeroed registers: each sample ends up
				 * in the upper byte of a word, so the high-half multiply
				 * below yields (sample * gain) >> 8 */
				pxor_r2r(mm0, mm0);
				pxor_r2r(mm1, mm1);
				punpcklbw_r2r(mm2, mm0);
				punpckhbw_r2r(mm2, mm1);
				pmulhuw_r2r(mm5, mm0);
				pmulhuw_r2r(mm5, mm1);
				// bright
				paddsw_r2r(mm4, mm0);
				paddsw_r2r(mm4, mm1);
				/* back to bytes with unsigned saturation, then restore
				 * the low-limit offset */
				packuswb_r2r(mm1, mm0);
				paddusb_r2r(mm6, mm0);

				// normalize
				pminub_r2r(mm7, mm0);

				// put result
				movntq_r2m(mm0, *d);
				//movq_r2m(mm0, *d);
				d += 8;
			}
			dp += W;
			sp += W;
		}
	} else {
		/* neutral luma: plain copy, skipped when a hue pass is configured */
		if (Hue == 0)
			memcpy(dp, sp, W*H);
	}

	/* ---- chroma planes ---- */
	if (Color != 256) {
		int v_offset;
		int u_offset;
		/* plane order follows csp; both planes are processed identically */
		if (csp == CSP_YV12) {
		  v_offset = W*H;
		  u_offset = v_offset + (v_offset/4);
		} else {
		  u_offset = W*H;
		  v_offset = u_offset + (u_offset/4);
		}
		W /= 2;
		H /= 2;
		dp = dst + v_offset;
		sp = src + v_offset;
		/* mm7 = high limit, mm6 = low limit, mm5 = colour gain, mm4 = 128 bias */
		movq_m2r(mhilim, mm7);
		movq_m2r(mlolim, mm6);
		movq_m2r(mcolor, mm5);
		movq_m2r(mb128, mm4);
		/* first pass handles the plane at v_offset, second the one at u_offset */
		for (uv = 2; uv > 0; uv--) {
			for (y = 0; y < H; y++) {
				d = dp;
				s = sp;
				for (x = 0; x < W; x+=8) {
					movq_m2r(*s, mm2);
					s += 8;

					/* remove the 128 bias so samples become signed */
					psubb_r2r(mm4, mm2);
					/* same upper-byte trick as for luma, with a signed
					 * high-half multiply */
					pxor_r2r(mm0, mm0);
					pxor_r2r(mm1, mm1);
					punpcklbw_r2r(mm2, mm0);
					punpckhbw_r2r(mm2, mm1);
					pmulhw_r2r(mm5, mm0);
					pmulhw_r2r(mm5, mm1);
					/* back to signed bytes, then restore the bias */
					packsswb_r2r(mm1, mm0);
					paddb_r2r(mm4, mm0);

					// normalize
					pmaxub_r2r(mm6, mm0);
					pminub_r2r(mm7, mm0);

					// put result
					movntq_r2m(mm0, *d);
					//movq_r2m(mm0, *d);

					d += 8;
				}
				dp += W;
				sp += W;
			}
			dp = dst + u_offset;
			sp = src + u_offset;
		}
	} else {
		/* neutral chroma: copy both planes at once (W and H are still the
		 * luma size here), skipped when a hue pass is configured */
		if (Hue == 0)
			memcpy(dst+(W*H), src+(W*H), (W*H)/2);
	}
	/* leave MMX state so later floating-point code works */
	emms();
}
#endif /* ARCH_X86 */

static void
colour_trans_yv12(uint8_t *dst, uint8_t *src)
{
	if (Hue != 0) {
		hue_trans_yv12(dst, src);
		src = dst;
	}
	if (Bright != 0 || Contrast != 256 || Color != 256)
		yv12_trans(dst, src);
}

/*
 * Filter one frame with the settings stored by colour_init().
 *
 * dest:     output frame.
 * src:      input frame.
 * hue_flag: non-zero runs only the hue pass; zero runs the full chain
 *           through colour_trans_yv12().
 *
 * Returns 0 on success, or -1 (after printing a message to stderr) when the
 * stored colour space is not one of the planar 4:2:0 formats.
 */
int
colour(uint8_t *dest, uint8_t *src, int hue_flag)
{
  if (csp != CSP_YV12 && csp != CSP_YUV420P && csp != CSP_I420) {
    fprintf(stderr, "colour: CSP_RGB does not inplement\n");
    return -1;
  }

  if (hue_flag) {
    hue_trans_yv12(dest, src);
  } else {
    colour_trans_yv12(dest, src);
  }

  return 0;
}

/*
 * Configure the colour filter.
 *
 * width, height: frame size in pixels; both must be positive.
 * bright:        offset added to luma.
 * hue:           hue rotation; scaled by 255/359 into the internal 0..255
 *                hue circle.
 * contrast:      luma gain change in percent (0 = unchanged).
 * color:         chroma gain change in percent (0 = unchanged).
 * in_csp:        CSP_YV12, CSP_YUV420P or CSP_I420.
 *
 * Returns -1 on invalid size or unsupported colour space (a message is
 * printed to stderr), 0 when every adjustment is zero (nothing is stored and
 * the filter is not needed), and 1 when the filter has been set up.
 */
int
colour_init(int width, int height, int bright, int hue, int contrast, int color, int in_csp)
{
  int i;
  double h;
  double t;

  if (width <= 0 || height <= 0) {
    fprintf(stderr, "colour_init: width height invalid, width %d, height %d\n",
            width, height);
    return -1;
  }
  if (in_csp != CSP_YV12 && in_csp != CSP_YUV420P && in_csp != CSP_I420) {
    fprintf(stderr, "colour_init: unsupported color space.\n");
    return -1;
  }

  if (bright == 0 && hue == 0 && contrast == 0 && color == 0) {
    return 0;
  }

  Width = width;
  Height = height;
  csp = in_csp;

  /*
   * Pick the kernel together with the geometry it will run on, so the two
   * can never disagree.  The MMX kernel moves eight bytes per step through
   * both the luma rows (width) and the chroma rows (width / 2); it is only
   * safe when the width is a multiple of 16, otherwise it would read and
   * write past the end of the planes.
   */
  yv12_trans = yv12_trans_c;
#ifdef ARCH_X86
  { uint32_t cpu_accel;
    cpu_accel = mm_accel();
    if ((cpu_accel & MM_ACCEL_X86_MMXEXT) && (width % 16) == 0)
      yv12_trans = yv12_trans_mmxext;
  }
#endif /* ARCH_X86 */

  Bright = bright;

  /* percent change -> gain where 256 means unchanged, rounded to nearest */
  t = 100 + contrast;
  t = t * 256 / 100;
  Contrast = (int)(t+0.5);
  if (Contrast < 0)
    Contrast = 0;

  t = 100 + color;
  t = t * 256 / 100;
  Color = (int)(t+0.5);
  if (Color < 0)
    Color = 0;

#ifdef ARCH_X86
  /* packed copies of the constants for the MMX kernel */
  for (i = 0; i < 8; i++) {
    mhilim.ub[i] = 235;
    mlolim.ub[i] = 16;
    mb128.ub[i] = 128;
  }
  for (i = 0; i < 4; i++) {
    mbright.w[i] = Bright;
    mcontrast.w[i] = Contrast;
    mcolor.w[i] = Color;
  }
#endif /* ARCH_X86 */

  /* map the hue angle onto the 0..255 hue circle; truncates toward zero */
  h = 255./359.;
  h *= hue;
  Hue = h;
//  s = 255./100.;
//  s *= Saturation;
//  Saturation = s;
//  l = 255./100.;
//  l *= Lightness;
//  Lightness = l;

  /* YUV -> RGB lookup tables used by the hue pass */
  for (i = 0; i < 256; i++) {
    RGB_Y_tab[i] = FIX_OUT(RGB_Y_OUT) * (i - Y_ADD_OUT);
    B_U_tab[i] = FIX_OUT(B_U_OUT) * (i - U_ADD_OUT);
    G_U_tab[i] = FIX_OUT(G_U_OUT) * (i - U_ADD_OUT);
    G_V_tab[i] = FIX_OUT(G_V_OUT) * (i - V_ADD_OUT);
    R_V_tab[i] = FIX_OUT(R_V_OUT) * (i - V_ADD_OUT);
  }

  return 1;
}

