/* flicker.c */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include "csp.h"
#include "util.h"
#include "video.h"
#include "vutil.h"


#ifdef ARCH_X86
#include "attributes.h"
#include "mmx.h"
#include "mm_accel.h"
#endif /* ARCH_X86 */


/* Detector state shared by flicker_init(), flicker() and flicker_quit(). */
typedef struct {
  int width;            /* width argument given to flicker_init() */
  int height;           /* height argument given to flicker_init() */
  int csp;              /* colorspace id; flicker() triples the line length for CSP_RGB24 */
  float eval_per;       /* percentage given to flicker_init() (passed in as int, stored as float) */
  int total_blocks;     /* number of block_sad entries examined by calc_intens() */
  int blocks_per_line;  /* block_sad entries per row of blocks; row stride used by flicker() */
  int eval_blocks;      /* how many of the largest entries get summed; derived from total_blocks and eval_per */
  uint16_t *block_sad;  /* per-block sums of absolute differences; allocated by flicker_init() */
} flicker_t;

/* The single, file-wide detector instance; all functions access it through flkt. */
static flicker_t flickt;
static flicker_t *flkt = &flickt;

/*
 * Line kernel selected by flicker_init(): compares `width` bytes of src0 and
 * src1 and adds the per-8-byte sums of absolute differences to consecutive
 * entries of sad[].  It stays NULL until flicker_init() has been called.
 */
static void (*flicker_line)(unsigned char *src0, unsigned char *src1, int width, short *sad);

/* Portable C implementation of the line kernel. */
static void flicker_line_c(unsigned char *src0, unsigned char *src1, int width, short *sad);
#ifdef ARCH_X86
/* x86 implementations, picked at run time from the mm_accel() capability flags. */
static void flicker_line_mmx(unsigned char *src0, unsigned char *src1, int width, short *sad);
static void flicker_line_mmxext(unsigned char *src0, unsigned char *src1, int width, short *sad);
#endif /* ARCH_X86 */

/*
 * flicker_quit - release the block SAD buffer.
 *
 * The pointer stored in the detector state is cleared, so calling this
 * function repeatedly (or before flicker_init()) is harmless.
 */
void
flicker_quit(void)
{
  uint16_t *buf = flkt->block_sad;

  /* Detach first, then release; free(NULL) is defined to do nothing. */
  flkt->block_sad = NULL;
  free(buf);
}

/*
 * flicker_init - (re)configure the detector for a frame geometry.
 *
 *   width, height  frame dimensions; both must be positive
 *   csp            colorspace id, stored for flicker()
 *   eval_per       percentage of blocks (those with the largest sums) that
 *                  flicker() adds up; the resulting count is clamped to
 *                  [0, total_blocks]
 *
 * Any buffer from an earlier call is released first.  Returns OK on success
 * and FAIL on invalid dimensions or allocation failure; after FAIL no buffer
 * is held.
 *
 * Block layout: flicker() performs height - 1 adjacent-line comparisons and
 * accumulates eight of them per row of blocks, so ceil((height - 1) / 8)
 * rows are needed.  (The former `width / 8 * height / 8` under-counted the
 * rows whenever height was not a multiple of 8, e.g. 720x486, and flicker()
 * then wrote past the end of the buffer.)  Each row holds one entry per
 * started group of 8 bytes of `width`.
 *
 * NOTE(review): the buffer comes from mem_align() but is released with
 * free(), as before - confirm that pairing is valid for mem_align().
 */
int
flicker_init(int width, int height, int csp, int eval_per)
{
  int blocks_per_line;
  int block_rows;
  int total;
  long long eval;
  size_t blk_num;

  /* Drop the buffer of a previous configuration; free(NULL) is a no-op. */
  free(flkt->block_sad);
  flkt->block_sad = NULL;

  if (width <= 0 || height <= 0)
    return FAIL;

  blocks_per_line = width / 8 + (width % 8 != 0);
  block_rows = (height - 1) / 8 + ((height - 1) % 8 != 0);
  total = blocks_per_line * block_rows;

  flkt->width = width;
  flkt->height = height;
  flkt->csp = csp;
  flkt->total_blocks = total;
  flkt->blocks_per_line = blocks_per_line;

  /*
   * Entries to allocate, rounded up to a multiple of 4:
   *  - total_blocks entries that are actually evaluated,
   *  - two extra rows of slack, because flicker() hands lines of 3 * width
   *    bytes to the line kernel for RGB24 input and a kernel emits one entry
   *    per 8 bytes, i.e. up to three rows' worth starting at the last row,
   *  - 4 entries of padding, as before.
   */
  blk_num = (size_t) total + 2 * (size_t) blocks_per_line + 4;
  blk_num = (blk_num + 3) / 4 * 4;
  flkt->block_sad = (uint16_t*) mem_align(blk_num * sizeof(uint16_t), 16);
  if (!flkt->block_sad)
    return FAIL;
  /* Keep slack and padding deterministic instead of uninitialised. */
  memset(flkt->block_sad, 0, blk_num * sizeof(uint16_t));

  flkt->eval_per = eval_per;
  /* Widen before multiplying, then keep the count inside the block array. */
  eval = (long long) total * eval_per / 100;
  if (eval < 0)
    eval = 0;
  if (eval > total)
    eval = total;
  flkt->eval_blocks = (int) eval;
#ifndef NDEBUG
  printf("flicker_init: total_blocks %d, eval_blocks %d.\n", flkt->total_blocks, flkt->eval_blocks);
#endif

  /* Pick the fastest line kernel available; later checks override earlier ones. */
#ifdef ARCH_X86
  { uint32_t cpu_accel;
    cpu_accel = mm_accel();
    flicker_line = flicker_line_c;
    if (cpu_accel & MM_ACCEL_X86_MMX)
      flicker_line = flicker_line_mmx;
    if (cpu_accel & MM_ACCEL_X86_MMXEXT)
      flicker_line = flicker_line_mmxext;
  }
#else
  flicker_line = flicker_line_c;
#endif /* ARCH_X86 */

  return OK;
}

//#if defined ARCH_X86 && (defined USE_MMXEXT || defined USE_MMX)

//#if defined USE_MMX || defined USE_MMXEXT

#ifdef ARCH_X86

/*
 * MMX line kernel (chosen by flicker_init() when MM_ACCEL_X86_MMX is set).
 *
 * Consumes src0/src1 in groups of 32 bytes and, per group, adds one packed
 * quadword (four 16-bit lanes) to sad[0..3], then advances sad by 4.
 * The trailing width % 32 bytes of the line are not processed.
 *
 * NOTE(review): assuming mmx.h uses the usual (source, destination) operand
 * order, each psubusb/psubusb/paddusb triple leaves eight per-byte absolute
 * differences in mm0..mm3.  Nothing adds those bytes together before the
 * shift/OR packing below, so the quadword added to sad[] does not appear to
 * be four per-block sums (compare the psadbw based MMXEXT kernel) - confirm
 * and fix.
 */
static void
flicker_line_mmx(unsigned char *src0, unsigned char *src1, int width, short *sad)
{
  width >>= 5;                                  // number of whole 32-byte groups
  for (; width > 0; width--) {
    movq_m2r (*src0, mm0);                      // load src0 bytes 0..7
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm1);                      // load src0 bytes 8..15
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm2);                      // load src0 bytes 16..23
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm3);                      // load src0 bytes 24..31
    src0 += 8;                                  // advance src0 pointer

    movq_m2r (*src1, mm4);                    // load src1 bytes 0..7
    src1 += 8;                                  // advance src1 pointer
    movq_r2r (mm4, mm5);                    // second copy of the src1 bytes
    psubusb_r2r (mm0, mm4);                 // presumably mm4 = saturate(src1 - src0)
    psubusb_r2r (mm5, mm0);                 // presumably mm0 = saturate(src0 - src1)
    paddusb_r2r (mm4, mm0);                 // one of the two is 0 per byte, so mm0 = |src0 - src1| per byte

    movq_m2r (*src1, mm4);                    // load src1 bytes 8..15
    src1 += 8;                                  // advance src1 pointer
    movq_r2r (mm4, mm5);                    // second copy of the src1 bytes
    psubusb_r2r (mm1, mm4);                 // same absolute-difference sequence, result in mm1
    psubusb_r2r (mm5, mm1);
    paddusb_r2r (mm4, mm1);

    movq_m2r (*src1, mm4);                    // load src1 bytes 16..23
    src1 += 8;                                  // advance src1 pointer
    movq_r2r (mm4, mm5);                    // second copy of the src1 bytes
    psubusb_r2r (mm2, mm4);                 // same absolute-difference sequence, result in mm2
    psubusb_r2r (mm5, mm2);
    paddusb_r2r (mm4, mm2);

    movq_m2r (*src1, mm4);                    // load src1 bytes 24..31
    src1 += 8;                                  // advance src1 pointer
    movq_r2r (mm4, mm5);                    // second copy of the src1 bytes
    psubusb_r2r (mm3, mm4);                 // same absolute-difference sequence, result in mm3
    psubusb_r2r (mm5, mm3);
    paddusb_r2r (mm4, mm3);

    // Pack the four registers into one quadword: mm1 shifted by 16 bits is
    // ORed into mm0, mm3 likewise into mm2, then mm2 shifted by 32 into mm0.
    // NOTE(review): the registers still hold 8 separate byte differences at
    // this point (see the header comment).
    psllq_i2r (16, mm1);
    por_r2r (mm1, mm0);
    psllq_i2r (16, mm3);
    por_r2r (mm3, mm2);

    psllq_i2r (32, mm2);
    por_r2r (mm2, mm0);

    movq_m2r (*sad, mm4);                       // load sad[0..3]
    paddw_r2r(mm0, mm4);                        // add the packed result per 16-bit lane
    movq_r2m (mm4, *sad);                       // store sad[0..3]
    sad += 4;                                   // next four block entries
  }
}

/*
 * MMXEXT line kernel (chosen by flicker_init() when MM_ACCEL_X86_MMXEXT is
 * set; it overrides the plain MMX choice).
 *
 * Consumes src0/src1 in groups of 32 bytes.  For each group it forms four
 * 8-byte sums of absolute differences with psadbw, packs them into the four
 * 16-bit lanes of one quadword and adds that to sad[0..3], then advances sad
 * by 4.  The trailing width % 32 bytes of the line are not processed.
 *
 * Assumes mmx.h's psadbw_m2r maps to the PSADBW instruction, which leaves
 * the 16-bit sum in the low word of the destination and zeroes the rest.
 */
static void
flicker_line_mmxext(unsigned char *src0, unsigned char *src1, int width, short *sad)
{
  width >>= 5;                                  // number of whole 32-byte groups
  for (; width > 0; width--) {
    movq_m2r (*src0, mm0);                      // load src0 bytes 0..7
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm1);                      // load src0 bytes 8..15
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm2);                      // load src0 bytes 16..23
    src0 += 8;                                  // advance src0 pointer
    movq_m2r (*src0, mm3);                      // load src0 bytes 24..31
    src0 += 8;                                  // advance src0 pointer

    psadbw_m2r (*src1, mm0);                    // mm0 = SAD of bytes 0..7
    src1 += 8;                                  // advance src1 pointer
    psadbw_m2r (*src1, mm1);                    // mm1 = SAD of bytes 8..15
    src1 += 8;                                  // advance src1 pointer
    psadbw_m2r (*src1, mm2);                    // mm2 = SAD of bytes 16..23
    src1 += 8;                                  // advance src1 pointer
    psadbw_m2r (*src1, mm3);                    // mm3 = SAD of bytes 24..31
    src1 += 8;                                  // advance src1 pointer

    // Pair the sums up: word 1 of mm0 receives mm1's sum, word 1 of mm2
    // receives mm3's sum.
    psllq_i2r (16, mm1);
    por_r2r (mm1, mm0);
    psllq_i2r (16, mm3);
    por_r2r (mm3, mm2);

    // Low dword of mm2 becomes the high dword of mm0: four sums in lane order.
    punpckldq_r2r (mm2, mm0);

    movq_m2r (*sad, mm4);                       // load sad[0..3]
    paddw_r2r(mm0, mm4);                        // accumulate the four sums
    movq_r2m (mm4, *sad);                       // store sad[0..3]
    sad += 4;                                   // next four block entries
  }
}

#else
#undef emms
#define emms()
#endif /* ARCH_X86 */

/*
 * Portable line kernel.
 *
 * Walks the first `width` bytes of src0 and src1 in chunks of 8 (the last
 * chunk may be shorter) and adds each chunk's sum of absolute byte
 * differences to the next entry of sad[].  ceil(width / 8) entries are
 * updated; nothing is written when width <= 0.
 */
static void
flicker_line_c(unsigned char *src0, unsigned char *src1, int width, short *sad)
{
  int pos = 0;

  while (pos < width) {
    int left = width - pos;
    int chunk = (left < 8) ? left : 8;
    int sum = 0;

    for (; chunk > 0; chunk--, pos++) {
      int diff = src0[pos] - src1[pos];

      sum += (diff < 0) ? -diff : diff;
    }
    *sad++ += sum;
  }
}

/* Insertion sort: put the n elements a[0..n-1] into ascending order. */
static void inssort(int n, short a[])
{
    int i, j;
    short x;

    for (i = 1; i < n; i++) {
        x = a[i];
        for (j = i - 1; j >= 0 && a[j] > x; j--)
            a[j + 1] = a[j];
        a[j + 1] = x;
    }
}

/*
 * Partial quicksort (quickselect) on the INCLUSIVE index range
 * a[first..last].
 *
 * Precondition: nothing left of `first` is larger, and nothing right of
 * `last` is smaller, than any element inside the range (trivially true for
 * a whole array).
 * Postcondition: every element at an index >= tgp is >= every element at an
 * index < tgp, i.e. a[tgp..] holds the largest values.  The order inside
 * each side is unspecified.
 *
 * Only the side that contains the split point is processed further, so the
 * narrowing is done with a loop rather than recursion.
 */
static void
quicksort(short a[], int first, int last, int tgp)
{
  /* A split point at or outside the range edge needs no work. */
  while (first < tgp && tgp <= last) {
    int i, j;
    short x, t;

    if (last - first < 10) {
      /* `last` is inclusive, hence the + 1 on the element count. */
      inssort(last - first + 1, &a[first]);
      return;
    }

    /* Hoare partition around the middle element. */
    x = a[first + (last - first) / 2];
    i = first; j = last;
    for (;;) {
      while (a[i] < x) i++;
      while (x < a[j]) j--;
      if (i >= j) break;
      t = a[i]; a[i] = a[j]; a[j] = t;
      i++; j--;
    }

    /*
     * Now a[first..i-1] <= x <= a[j+1..last] and i - j is 0 or 1.
     * A split point of i or j + 1 coincides with the partition boundary.
     */
    if (tgp < i)
      last = i - 1;
    else if (tgp > j + 1)
      first = j + 1;
    else
      return;
  }
}

/*
 * Sum of the eval_num largest of the sad_num values in sad[].
 *
 * sad[0..sad_num-1] is reordered in place; no element outside that range is
 * read or written.  eval_num is clamped to [0, sad_num].
 */
static inline unsigned int
calc_intens(short *sad, int sad_num, int eval_num)
{
  int i;
  int start;
  unsigned int intensity = 0;

  if (sad_num <= 0 || eval_num <= 0)
    return 0;
  if (eval_num > sad_num)
    eval_num = sad_num;

  start = sad_num - eval_num;
  quicksort(sad, 0, sad_num - 1, start);

  for (i = start; i < sad_num; i++)
    intensity += sad[i];
  return intensity;
}

unsigned int
flicker(unsigned char *bottom, unsigned char *top)
{
  unsigned int intensity = 0;
  int y;
  int src_stride;
  int block_stride;
  int w = flkt->width;
  int h = flkt->height;
  int i;
  short *sad = flkt->block_sad;

  switch (flkt->csp) {
    case CSP_YV12:
    case CSP_YUV420P:
    case CSP_I420:
      break;
    case CSP_RGB24:
      w *= 3;
      break;
  }

  bottom += w;
  src_stride = w * 2;
  block_stride = flkt->blocks_per_line;

  h--;

  for (y = 0; y < h;) {
    int j;
    //memset(sad, 0, block_stride * sizeof(short));
    for (j = 0; j < block_stride; j++)
      sad[j] = 0;
    for (i = 0; i < 8 && y < h; i++, y++) {
      if (y & 1) {
        flicker_line(bottom, top, w, sad);
        bottom += src_stride;
      } else {
        flicker_line(top, bottom, w, sad);
        top += src_stride;
      }
    }
    sad += block_stride;
  }

  intensity = calc_intens(flkt->block_sad, flkt->total_blocks, flkt->eval_blocks);

  emms();
#if 0
  {static int p = 0;
  printf("p %4d: intensity %u\n", p++, intensity);
  }
#endif
  return intensity;
}

