/*
 * Copyright (c) 2006-2007 NTT DATA CORPORATION.
 * All rights reserved.
 */

#include "postgres.h"

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#include "fmgr.h"
#include <mb/pg_wchar.h>
#include "pgsenna2.h"
#include "utils/guc.h"

/*
 * Function-info declarations (PG_FUNCTION_INFO_V1) for the functions
 * implemented in this file.  Note that pgs2textporter1 and pgs2textporter2
 * are only defined further down when TEXTPORTER is set at compile time,
 * while their declarations here are unconditional.
 */
PG_FUNCTION_INFO_V1(pgs2pdftotext1);
PG_FUNCTION_INFO_V1(pgs2pdftotext2);
PG_FUNCTION_INFO_V1(pgs2snippet1);
PG_FUNCTION_INFO_V1(pgs2textporter1);
PG_FUNCTION_INFO_V1(pgs2textporter2);


/*
 * Remove the temporary file named by `path`.
 *
 * Returns normally when unlink() succeeds; otherwise raises an ERROR that
 * carries the errno left by unlink().
 */
inline static void
tempfile_unlink(char *path)
{
  int rc = unlink(path);

  if (rc != -1) {
    return;
  }
  elog(ERROR, "pgsenna2: failed to unlink temporary file (%d)", errno);
}

Datum
pgs2pdftotext1(PG_FUNCTION_ARGS)
{
  text *path_a = (text*)PG_GETARG_TEXT_P(0);
  char *path = NULL;
  char command[512];
  int buf_size = 4096;
  int buf_read = 0;
  int buf_read_total = 0;
  FILE *stdout_pdftotext;
  text *filtered;

  path = text2cstr(path_a);
  snprintf(command, sizeof(command), "pdftotext %s -",path);
  command[511] = '\0';
  stdout_pdftotext = popen(command, "r");
  if (stdout_pdftotext == (FILE *)-1) {
    elog(ERROR, "pgsenna2: failed to popen for pdftotext (%d)", errno);
  }
  filtered = palloc(VARHDRSZ + buf_size);
  if (filtered == NULL) {
    elog(ERROR, "pgsenna2: failed to palloc for return value");
  }
  while ((buf_read = fread(VARDATA(filtered) + buf_read_total,
                           sizeof(char), buf_size - buf_read_total,
                           stdout_pdftotext))) {
    if (buf_read == (buf_size - buf_read_total)) {
      buf_size *= 2;
      filtered = repalloc(filtered, VARHDRSZ + buf_size);
      if (filtered == NULL) {
        elog(ERROR, "pgsenna2: failed to palloc for return value");
      }
    }
    buf_read_total += buf_read;
  }
  if (ferror(stdout_pdftotext)) {
    elog(ERROR, "pgsenna2: failed to fread temporary PDFfile");
  }
  if (pclose(stdout_pdftotext) != 0) {
    elog(ERROR, "pgsenna2: failed pdftotext 1 (%d)", errno);
  }
  /* varatt_size include the size of itself */
#ifdef POSTGRES83
  SET_VARSIZE_4B(filtered, VARHDRSZ + buf_read_total);
#else
  VARATT_SIZEP(filtered) = VARHDRSZ + buf_read_total;
#endif
  PG_RETURN_TEXT_P(filtered);
}

/*
 * pgs2pdftotext2(pdfdata bytea) returns text
 *
 * Writes the bytea argument to a temporary file created with mkstemp()
 * under /tmp, runs "pdftotext <tempfile> -" through popen(), and returns the
 * command's standard output as a text value.  The temporary file is removed
 * before returning and on every error path handled explicitly here.
 *
 * On WIN32 builds the mkstemp() call is compiled out, so fd stays -1 and the
 * function always raises the mkstemp ERROR.
 *
 * NOTE(review): an error raised inside palloc()/repalloc() or
 * tempfile_unlink() leaves this function without running the remaining
 * cleanup, so the temporary file (and the pipe, if open) can be left behind
 * in that case.
 */
Datum
pgs2pdftotext2(PG_FUNCTION_ARGS)
{
  bytea *pdfdata = (bytea*)PG_GETARG_BYTEA_P(0);
  char path[64] = "/tmp/pgs2_XXXXXX";
  size_t pdf_len = VARSIZE(pdfdata) - VARHDRSZ;
  size_t byte_wrote;
  FILE *tempfile;
  int fd = -1;
  int saved_errno;
  char command[512];
  int buf_size = 4096;
  int buf_read = 0;
  int buf_read_total = 0;
  FILE *stdout_pdftotext;
  text *filtered;

#ifndef WIN32
  fd = mkstemp(path);
#endif
  if (fd == -1) {
    elog(ERROR, "pgsenna2: failed mkstemp for temporary PDFfile (%d)", errno);
  }
  close(fd);
  tempfile = fopen(path, "wb");
  if (tempfile == NULL) {
    /* capture errno before the cleanup call can overwrite it */
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fopen temporary PDFfile (%d)",
         saved_errno);
  }
  /* a short count from fwrite() means the write failed part-way */
  byte_wrote = fwrite(VARDATA(pdfdata), sizeof(char), pdf_len, tempfile);
  if (byte_wrote != pdf_len) {
    saved_errno = errno;
    fclose(tempfile);
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fwrite temporary PDFfile (%d)",
         saved_errno);
  }
  if (fclose(tempfile) != 0) {
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fclose temporary PDFfile (%d)",
         saved_errno);
  }
  /* path is the short mkstemp() name, so the command always fits */
  snprintf(command, sizeof(command), "pdftotext %s -", path);

  /*
   * palloc()/repalloc() raise an error themselves on allocation failure
   * rather than returning NULL, so their results are not NULL-checked.
   */
  filtered = palloc(VARHDRSZ + buf_size);

  /* popen() signals failure with NULL */
  stdout_pdftotext = popen(command, "r");
  if (stdout_pdftotext == NULL) {
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to popen for pdftotext (%d)", saved_errno);
  }
  /*
   * Read until fread() returns 0 (EOF or error).  Whenever a read fills the
   * remaining space exactly, double the buffer before the next read.
   */
  while ((buf_read = fread(VARDATA(filtered) + buf_read_total,
                           sizeof(char), buf_size - buf_read_total,
                           stdout_pdftotext))) {
    if (buf_read == (buf_size - buf_read_total)) {
      buf_size *= 2;
      filtered = repalloc(filtered, VARHDRSZ + buf_size);
    }
    buf_read_total += buf_read;
  }
  if (ferror(stdout_pdftotext)) {
    /* close the pipe before the error unwinds out of this function */
    pclose(stdout_pdftotext);
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fread temporary PDFfile");
  }
  if (pclose(stdout_pdftotext) != 0) {
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed pdftotext 2 (%d)", saved_errno);
  }
  /* varatt_size include the size of itself */
#ifdef POSTGRES83
  SET_VARSIZE_4B(filtered, VARHDRSZ + buf_read_total);
#else
  VARATT_SIZEP(filtered) = VARHDRSZ + buf_read_total;
#endif
  tempfile_unlink(path);
  PG_RETURN_TEXT_P(filtered);
}

/*
 * pgs2snippet1(flags int4, width int4, max_results int4,
 *              defaultopentag text, defaultclosetag text, mapping,
 *              keywords text, document text) returns text
 *
 * Opens a Senna snippet object using the encoding that matches the database
 * encoding (UTF-8, EUC-JP or SJIS; anything else uses sen_enc_default),
 * registers every space-separated word of `keywords` as a condition, runs it
 * over `document`, and returns the first snippet (index 0) as text.
 *
 * Raises ERROR when any Senna call fails; the snippet object is closed
 * before each of those errors is raised so it is not leaked.
 *
 * NOTE(review): when sen_snip_exec() finds no snippet (nresults == 0) the
 * request for index 0 presumably fails and this function raises ERROR --
 * confirm that this is the intended behaviour for "no match".
 */
Datum pgs2snippet1(PG_FUNCTION_ARGS)
{
  sen_rc rc = sen_success;
  int flags = PG_GETARG_INT32(0);
  size_t width = (size_t)PG_GETARG_INT32(1);
  unsigned int max_results = PG_GETARG_INT32(2);
  text *defaultopentag_ = (text*)PG_GETARG_TEXT_P(3);
  char *defaultopentag = text2cstr(defaultopentag_);
  text *defaultclosetag_ = (text*)PG_GETARG_TEXT_P(4);
  char *defaultclosetag = text2cstr(defaultclosetag_);
  sen_snip_mapping *mapping = (sen_snip_mapping *)PG_GETARG_POINTER(5);
  text *keywords_ = (text*)PG_GETARG_TEXT_P(6);
  char *keywords = text2cstr(keywords_);
  text *document_ = (text*)PG_GETARG_TEXT_P(7);
  char *document = text2cstr(document_);
  text *result = NULL;
  unsigned int result_len = 0;
  sen_snip *snip = NULL;
  unsigned int nresults = 0;
  unsigned int max_tagged_len = 0;
  char *tokenp = NULL;
  sen_encoding encoding = sen_enc_default;

  switch (GetDatabaseEncoding()) {
  case PG_UTF8:
    encoding = sen_enc_utf8;
    break;
  case PG_EUC_JP:
    encoding = sen_enc_euc_jp;
    break;
  case PG_SJIS:
    encoding = sen_enc_sjis;
    break;
  default:
    encoding = sen_enc_default;
  }
  snip = sen_snip_open(encoding, flags, width, max_results,
                       defaultopentag, strlen(defaultopentag),
                       defaultclosetag, strlen(defaultclosetag),
                       mapping);
  if (!snip) {
    elog(ERROR, "pgsenna2: sen_snip_open() failed");
  }
  /* strtok() cuts `keywords` in place; each word becomes one condition */
  tokenp = strtok(keywords, " ");
  while (tokenp != NULL) {
    rc = sen_snip_add_cond(snip, tokenp, strlen(tokenp), NULL, 0, NULL, 0);
    if (rc != sen_success) {
      sen_snip_close(snip);
      elog(ERROR, "pgsenna2: sen_snip_add_cond() failed %d", rc);
    }
    tokenp = strtok(NULL, " ");
  }
  rc = sen_snip_exec(snip, document, strlen(document),
                     &nresults, &max_tagged_len);
  if (rc != sen_success) {
    sen_snip_close(snip);
    elog(ERROR, "pgsenna2: sen_snip_exec() failed %d", rc);
  }
  /* max_tagged_len is the buffer size sen_snip_exec() asked for */
  result = palloc(VARHDRSZ + max_tagged_len);
  memset(VARDATA(result), 0, max_tagged_len);
  rc = sen_snip_get_result(snip, 0, VARDATA(result), &result_len);
  if (rc != sen_success) {
    sen_snip_close(snip);
    elog(ERROR, "pgsenna2: sen_snip_get_result() failed %d", rc);
  }
  rc = sen_snip_close(snip);
  if (rc != sen_success) {
    elog(ERROR, "pgsenna2: sen_snip_close() failed %d", rc);
  }
  /* the C strings are released only after the snippet object is closed */
  pfree(defaultopentag);
  pfree(defaultclosetag);
  pfree(keywords);
  pfree(document);
  /*
   * Size the text by result_len, the length sen_snip_get_result() reported
   * for the snippet.  Using max_tagged_len (the size of the zero-filled
   * buffer, documented by Senna as including the terminating NUL) would
   * return the snippet followed by padding NUL bytes.
   */
#ifdef POSTGRES83
  SET_VARSIZE_4B(result, VARHDRSZ + result_len);
#else
  VARATT_SIZEP(result) = VARHDRSZ + result_len;
#endif
  PG_RETURN_TEXT_P(result);
}

#ifdef TEXTPORTER
/*
 * Prototype of the external text-extraction routine called by the
 * pgs2textporter functions below.
 * NOTE(review): presumably supplied by the TextPorter library linked into
 * TEXTPORTER builds -- confirm.  As used in this file it is called with
 * (input file path, output file path, encoding name, flag), where flag is 1
 * when the "ludia.textporter_log" setting is "on" and 0 otherwise, and a
 * non-zero return value is handled as an error code.
 */
int _DMC_GetText_V4(unsigned char *pdfpath, unsigned char *path2,
                    char *enc, int flag);

/*
 * Report a failed TextPorter conversion at the configured message level.
 *
 * iRet    - return code of _DMC_GetText_V4(); 0 means nothing to report.
 * path2   - temporary output file; removed here only just before an ERROR
 *           is raised.
 * pdffile - input file name, used only in the message text.
 *
 * The level is taken from the "ludia.textporter_error" setting (falling
 * back to TEXTPORTER_ERROR when the setting is not available) and matched
 * case-insensitively by prefix: "debug" -> DEBUG1, "log" -> LOG,
 * "info" -> INFO, "notice" -> NOTICE, "warning" -> WARNING.  Any other
 * value removes path2 and raises ERROR.
 *
 * Returns 0 when iRet is 0, or when a non-ERROR level was used and the
 * setting does not end in "null"; returns 1 when the setting ends in
 * "null".
 */
static inline int textporter_error_check(int iRet, char *path2, char *pdffile)
{
  const char *option;
  char textporter_error[64];
  size_t setting_len;
  size_t i;

  if (iRet == 0) {
    return 0;
  }

  option = GetConfigOption("ludia.textporter_error");
  /* snprintf always NUL-terminates, unlike strncpy with a full buffer */
  snprintf(textporter_error, sizeof(textporter_error), "%s",
           option ? option : TEXTPORTER_ERROR);
  for (i = 0; textporter_error[i] != '\0'; i++) {
    /* tolower() requires a value representable as unsigned char */
    textporter_error[i] = (char)tolower((unsigned char)textporter_error[i]);
  }

  if (strncmp(textporter_error, "debug", 5) == 0) {
    elog(DEBUG1, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
  } else if (strncmp(textporter_error, "log", 3) == 0) {
    elog(LOG, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
  } else if (strncmp(textporter_error, "info", 4) == 0) {
    elog(INFO, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
  } else if (strncmp(textporter_error, "notice", 6) == 0) {
    elog(NOTICE, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
  } else if (strncmp(textporter_error, "warning", 7) == 0) {
    elog(WARNING, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
  } else {
    tempfile_unlink(path2);
    elog(ERROR, "pgsenna2: textporter errorcode = %d, %s", iRet, pdffile);
    return 1;  /* not reached; kept so every path returns a value */
  }

  /*
   * A "null" suffix asks the caller to give up on this document.  The
   * length test keeps the suffix comparison inside the buffer for short
   * settings such as "log".
   */
  setting_len = strlen(textporter_error);
  if (setting_len >= 4 &&
      strcmp(textporter_error + setting_len - 4, "null") == 0) {
    return 1;
  }
  return 0;
}

/*
 * pgs2textporter1(path text) returns text
 *
 * Converts the file named by the argument with _DMC_GetText_V4() into a
 * temporary file created by mkstemp() under /tmp, then returns that file's
 * contents as text.  The output encoding requested is "EUC-JP" when the
 * database encoding is EUC_JP and "UTF-8" otherwise.
 *
 * Returns SQL NULL when textporter_error_check() returns 1.  Raises ERROR
 * when mkstemp() fails, when the converted file cannot be opened or read,
 * or when textporter_error_check() escalates the conversion failure.
 *
 * On WIN32 builds the mkstemp() call is compiled out, so fd stays -1 and the
 * function always raises the mkstemp ERROR.
 */
Datum
pgs2textporter1(PG_FUNCTION_ARGS)
{
  text *pdfpath_ = (text*)PG_GETARG_TEXT_P(0);
  char *pdfpath = text2cstr(pdfpath_);   /* conversion input */
  char path2[64] = "/tmp/pgs2_XXXXXX";   /* conversion output */
  FILE *tempfile2;
  int fd = -1;
  int saved_errno;
  char tmp_str[4096];
  int buf_size = sizeof(tmp_str);
  int buf_read_total = 0;
  text *filtered;
  int iRet;
  int log_flag;
  const char *option;
  char enc[64];

  switch (GetDatabaseEncoding()) {
  case PG_EUC_JP:
    strcpy(enc, "EUC-JP");
    break;
  case PG_UTF8:
  default:
    strcpy(enc, "UTF-8");
  }
#ifndef WIN32
  fd = mkstemp(path2);
#endif
  if (fd == -1) {
    elog(ERROR, "pgsenna2: failed mkstemp for temporary file 2(%d)", errno);
  }
  close(fd);

  /* compare the setting directly; no fixed-size copy is needed */
  option = GetConfigOption("ludia.textporter_log");
  if (option == NULL) {
    option = TEXTPORTER_LOG;
  }
  log_flag = (strcmp(option, "on") == 0) ? 1 : 0;
  iRet = _DMC_GetText_V4((unsigned char *)pdfpath, (unsigned char *)path2,
                         enc, log_flag);

  if (textporter_error_check(iRet, path2, pdfpath) == 1) {
    tempfile_unlink(path2);
    PG_RETURN_NULL();
  }
  /*
   * palloc()/repalloc() raise an error themselves on allocation failure
   * rather than returning NULL, so their results are not NULL-checked.
   */
  filtered = palloc(VARHDRSZ + buf_size);
  tempfile2 = fopen(path2, "r");
  if (tempfile2 == NULL) {
    /* capture errno before the cleanup call can overwrite it */
    saved_errno = errno;
    tempfile_unlink(path2);
    elog(ERROR, "pgsenna2: failed to fopen temporary file 2(%d)",
         saved_errno);
  }
  /*
   * Append the file chunk by chunk.  After every chunk the buffer is grown
   * so that a further full chunk always fits behind the data read so far.
   */
  while (fgets(tmp_str, buf_size, tempfile2) != NULL) {
    size_t chunk_len = strlen(tmp_str);

    memcpy(VARDATA(filtered) + buf_read_total, tmp_str, chunk_len);
    buf_read_total += chunk_len;
    filtered = repalloc(filtered, VARHDRSZ + buf_read_total + buf_size);
  }
  if (ferror(tempfile2)) {
    fclose(tempfile2);
    tempfile_unlink(path2);
    elog(ERROR, "pgsenna2: failed to read temporary file 2");
  }
  fclose(tempfile2);
  tempfile_unlink(path2);
  /* varatt_size include the size of itself */
#ifdef POSTGRES83
  SET_VARSIZE_4B(filtered, VARHDRSZ + buf_read_total);
#else
  VARATT_SIZEP(filtered) = VARHDRSZ + buf_read_total;
#endif
  PG_RETURN_TEXT_P(filtered);
}

/*
 * pgs2textporter2(data bytea) returns text
 *
 * Writes the bytea argument to a temporary file, converts that file with
 * _DMC_GetText_V4() into a second temporary file, and returns the second
 * file's contents as text.  Both files are created by mkstemp() under /tmp
 * and removed again here.  The output encoding requested is "EUC-JP" when
 * the database encoding is EUC_JP and "UTF-8" otherwise.
 *
 * Returns SQL NULL when textporter_error_check() returns 1.  Raises ERROR
 * when a temporary file cannot be created, written, opened or read, or when
 * textporter_error_check() escalates the conversion failure.
 *
 * On WIN32 builds the mkstemp() calls are compiled out, so fd stays -1 and
 * the function always raises the first mkstemp ERROR.
 */
Datum
pgs2textporter2(PG_FUNCTION_ARGS)
{
  bytea *pdfdata = (bytea*)PG_GETARG_BYTEA_P(0);
  char path[64] = "/tmp/pgs2_XXXXXX";    /* conversion input (from pdfdata) */
  char path2[64] = "/tmp/pgs2_XXXXXX";   /* conversion output */
  size_t data_len = VARSIZE(pdfdata) - VARHDRSZ;
  size_t byte_wrote;
  FILE *tempfile;
  FILE *tempfile2;
  int fd = -1;
  int saved_errno;
  char tmp_str[4096];
  int buf_size = sizeof(tmp_str);
  int buf_read_total = 0;
  text *filtered;
  int iRet;
  int log_flag;
  const char *option;
  char enc[64];

  switch (GetDatabaseEncoding()) {
  case PG_EUC_JP:
    strcpy(enc, "EUC-JP");
    break;
  case PG_UTF8:
  default:
    strcpy(enc, "UTF-8");
  }
#ifndef WIN32
  fd = mkstemp(path);
#endif
  if (fd == -1) {
    elog(ERROR, "pgsenna2: failed mkstemp for temporary file (%d)", errno);
  }
  close(fd);
  tempfile = fopen(path, "wb");
  if (tempfile == NULL) {
    /* capture errno before the cleanup call can overwrite it */
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fopen temporary file (%d)", saved_errno);
  }
  /* a short count from fwrite() means the write failed part-way */
  byte_wrote = fwrite(VARDATA(pdfdata), sizeof(char), data_len, tempfile);
  if (byte_wrote != data_len) {
    saved_errno = errno;
    fclose(tempfile);
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fwrite temporary file (%d)", saved_errno);
  }
  if (fclose(tempfile) != 0) {
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed to fclose temporary file (%d)", saved_errno);
  }
#ifndef WIN32
  fd = mkstemp(path2);
#endif
  if (fd == -1) {
    saved_errno = errno;
    tempfile_unlink(path);
    elog(ERROR, "pgsenna2: failed mkstemp for temporary file 2(%d)",
         saved_errno);
  }
  close(fd);

  /* compare the setting directly; no fixed-size copy is needed */
  option = GetConfigOption("ludia.textporter_log");
  if (option == NULL) {
    option = TEXTPORTER_LOG;
  }
  log_flag = (strcmp(option, "on") == 0) ? 1 : 0;
  iRet = _DMC_GetText_V4((unsigned char *)path, (unsigned char *)path2,
                         enc, log_flag);

  /* the input copy is no longer needed once the conversion has run */
  tempfile_unlink(path);
  if (textporter_error_check(iRet, path2, "") == 1) {
    tempfile_unlink(path2);
    PG_RETURN_NULL();
  }
  /*
   * palloc()/repalloc() raise an error themselves on allocation failure
   * rather than returning NULL, so their results are not NULL-checked.
   */
  filtered = palloc(VARHDRSZ + buf_size);
  tempfile2 = fopen(path2, "r");
  if (tempfile2 == NULL) {
    saved_errno = errno;
    tempfile_unlink(path2);
    elog(ERROR, "pgsenna2: failed to fopen temporary file 2(%d)",
         saved_errno);
  }
  /*
   * Append the file chunk by chunk.  After every chunk the buffer is grown
   * so that a further full chunk always fits behind the data read so far.
   */
  while (fgets(tmp_str, buf_size, tempfile2) != NULL) {
    size_t chunk_len = strlen(tmp_str);

    memcpy(VARDATA(filtered) + buf_read_total, tmp_str, chunk_len);
    buf_read_total += chunk_len;
    filtered = repalloc(filtered, VARHDRSZ + buf_read_total + buf_size);
  }
  if (ferror(tempfile2)) {
    fclose(tempfile2);
    tempfile_unlink(path2);
    elog(ERROR, "pgsenna2: failed to read temporary file 2");
  }
  fclose(tempfile2);
  tempfile_unlink(path2);
  /* varatt_size include the size of itself */
#ifdef POSTGRES83
  SET_VARSIZE_4B(filtered, VARHDRSZ + buf_read_total);
#else
  VARATT_SIZEP(filtered) = VARHDRSZ + buf_read_total;
#endif
  PG_RETURN_TEXT_P(filtered);
}
#endif   /* TEXTPORTER */
