/* Copyright(C) 2006 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "senna_in.h"
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "senna_in.h"
#include "str.h"

/* Weight every token starts with in get_token() before any adjust markers. */
#define DEFAULT_WEIGHT 5
/* Initial escalation_decaystep of a query; sen_query_exec() subtracts it
   from weight_offset each time it escalates to a looser match mode. */
#define DEFAULT_DECAYSTEP 2
/* Option value used by get_op() for the 'N' (near) operator when no
   number follows the letter. */
#define DEFAULT_MAX_INTERVAL 10
/* Option value used by get_op() for the 'S' (similar) operator when no
   number follows the letter. */
#define DEFAULT_SIMILARITY_THRESHOLD 10
/* Option value used by get_op() for the 'T' (term extract) operator when
   no number follows the letter. */
#define DEFAULT_TERM_EXTRACT_POLICY 0

/* One node of the parsed query tree: either a token (a word or phrase)
   or an expression (a list of child nodes).  Nodes are carved out of
   sen_query.cons_pool by cons_new(). */
typedef struct _cons {
  /* cons_token or cons_expr; selects which member of `u` is valid */
  int8_t type ;
  /* operator stored by get_token(); exec_query() uses it to combine this
     node with the results of the siblings before it */
  int8_t op;
  /* token weight from get_token(); exec_query() adds weight_offset to it */
  int8_t weight;
  /* selection mode; -1 makes exec_query() use the query's default_mode */
  int8_t mode;
  /* numeric argument parsed by get_op(); exec_query() passes it as both
     similarity_threshold and max_interval */
  int option;
  /* next sibling in the enclosing expression's list, or NULL */
  struct _cons *cdr;
  union {
    /* valid when type == cons_token: a span inside the query string
       (not copied, not NUL-terminated) */
    struct {
      const char *start;
      size_t len;
    } token;
    /* valid when type == cons_expr: `car` is the first child, `tail`
       points at the link that nconc() fills in on the next append */
    struct {
      struct _cons *car;
      struct _cons **tail;
    } expr;
  } u;
} cons;

/* Parser and execution state of one query string. */
struct _sen_query {
  /* the query text handed to sen_query_open(); tokens point into it */
  const char *str;
  /* current parse position inside `str` */
  const char *cur;
  /* scratch buffer of max_size + 1 bytes that qstr() copies a token into */
  char *buffer;
  /* length in bytes of the longest token seen while parsing */
  size_t max_size;
  /* operator given to tokens that carry no explicit operator marker */
  sen_sel_operator default_op;
  /* mode used for tokens whose mode is -1; set by sen_query_exec() */
  sen_sel_mode default_mode;
  /* escalation control read by sen_query_exec(); may be overridden by
     the 'E' pragma in get_pragma() */
  int escalation_threshold;
  /* amount subtracted from weight_offset on each escalation step */
  int escalation_decaystep;
  /* added to every token weight in exec_query() */
  int weight_offset;
  /* encoding passed to sen_isspace() / sen_str_charlen() */
  sen_encoding encoding;
  /* root expression produced by get_expr() */
  cons *expr;
  /* highest valid index into cons_pool */
  int max_exprs;
  /* index of the next unused cell in cons_pool */
  int cur_expr;
  /* sen_query_open() allocates max_exprs extra cells directly behind this
     one, so the pool really holds max_exprs + 1 cells */
  cons cons_pool[1]; /* dummy */
};

/* Discriminator stored in cons.type. */
typedef enum {
  cons_token = 0, /* leaf: cons.u.token is valid */
  cons_expr       /* list: cons.u.expr is valid */
} cons_type;

/* Take the next unused cell from the query's pool.
   Returns NULL once cur_expr has gone past max_exprs.  Only the `cdr`
   link is initialised; the caller fills in everything else. */
inline static cons *
cons_new(sen_query *q)
{
  cons *cell;
  if (q->cur_expr > q->max_exprs) { return NULL; }
  cell = &q->cons_pool[q->cur_expr];
  q->cur_expr += 1;
  cell->cdr = NULL;
  return cell;
}

/* Give the most recently allocated pool cell back.  Does nothing when
   the pool is already empty. */
inline static void
cons_del(sen_query *q)
{
  if (q->cur_expr <= 0) { return; }
  q->cur_expr -= 1;
}

/* Allocate an empty expression node: no children yet, and `tail` aimed
   at `car` so that the first nconc() stores the head of the list.
   Returns NULL when the pool is exhausted. */
inline static cons *
expr_new(sen_query *q)
{
  cons *node = cons_new(q);
  if (!node) { return NULL; }
  node->type = cons_expr;
  node->u.expr.car = NULL;
  node->u.expr.tail = &node->u.expr.car;
  return node;
}

/* Allocate a node tagged as a token.  The caller sets u.token.
   Returns NULL when the pool is exhausted. */
inline static cons *
token_new(sen_query *q)
{
  cons *node = cons_new(q);
  if (node) { node->type = cons_token; }
  return node;
}

/* Append node `c` to the child list of `expr`.  Silently ignored when
   `expr` is not an expression node. */
inline static void
nconc(cons *expr, cons *c)
{
  cons **slot;
  if (expr->type != cons_expr) { return; }
  slot = expr->u.expr.tail;
  *slot = c;
  /* the next append goes into the new last element's cdr link */
  expr->u.expr.tail = &c->cdr;
}

/* Forward declaration: get_token() calls get_expr() for a parenthesised
   group, and get_expr() in turn calls get_token(). */
static cons *get_expr(sen_query *q);

/* Move the parse position past any run of whitespace characters,
   stepping one encoded character at a time. */
inline static void
skip_space(sen_query *q)
{
  for (;;) {
    if (!*q->cur) { break; }
    if (!sen_isspace(q->cur, q->encoding)) { break; }
    q->cur += sen_str_charlen(q->cur, q->encoding);
  }
}

/* Read a quoted phrase.  On entry q->cur is just past the opening quote.
   The phrase runs up to the closing SEN_QUERY_QUOTER, which is consumed,
   or up to the end of the string when the quote is never closed.
   Returns a token node for a non-empty phrase; NULL for an empty phrase
   or when the pool is exhausted. */
inline static cons *
get_phrase(sen_query *q)
{
  cons *tok;
  size_t len;
  const char *head = q->cur, *p = head;
  while (*p && *p != SEN_QUERY_QUOTER) { p++; }
  /* step over the closing quote, but never over the terminating NUL */
  q->cur = *p ? p + 1 : p;
  if (p <= head) { return NULL; }
  if (!(tok = token_new(q))) { return NULL; }
  len = p - head;
  tok->u.token.start = head;
  tok->u.token.len = len;
  if (q->max_size < len) { q->max_size = len; }
  return tok;
}

/* Read one bare word starting at q->cur.
   The word ends at the end of the string, at whitespace, or at
   SEN_QUERY_PARENR (none of which is consumed), or at SEN_QUERY_PREFIX,
   which is consumed and reported through *prefixp = 1.
   Returns a token node for a non-empty word; NULL for an empty word or
   when the pool is exhausted.  *prefixp is only ever set, never cleared. */
inline static cons *
get_word(sen_query *q, int *prefixp)
{
  cons *c;
  size_t cl;
  const char *start = q->cur, *end;
  for (end = q->cur;; ) {
    if (!*end ||
        sen_isspace(end, q->encoding) ||
        *end == SEN_QUERY_PARENR) {
      q->cur = end;
      break;
    }
    if (*end == SEN_QUERY_PREFIX) {
      *prefixp = 1;
      q->cur = end + 1;
      break;
    }
    /* NOTE(review): sen_str_charlen() is defined elsewhere; should it
       ever report 0 for a non-NUL byte (e.g. a truncated multibyte
       sequence), adding 0 would spin here forever.  *end is known to be
       non-NUL at this point, so stepping a single byte is safe and
       guarantees progress. */
    cl = sen_str_charlen(end, q->encoding);
    end += cl ? cl : 1;
  }
  if (start < end && (c = token_new(q))) {
    size_t len = end - start;
    c->u.token.start = start;
    c->u.token.len = len;
    if (len > q->max_size) { q->max_size = len; }
    return c;
  }
  return NULL;
}

/* Parse the optional run of decimal digits at `start`.
   *endp receives the position just past the digits.  Returns the parsed
   value (clamped to INT_MAX), or `fallback` when there are no digits.
   Works in place: no temporary copy, hence no allocation that can fail. */
inline static int
get_op_option(const char *start, const char **endp, int fallback)
{
  long v;
  const char *end = start;
  /* <ctype.h> functions require a value representable as unsigned char */
  while (isdigit((unsigned char)*end)) { end++; }
  *endp = end;
  if (end == start) { return fallback; }
  /* start[0] is a digit, so strtol() consumes exactly the digit run */
  v = strtol(start, NULL, 10);
  return v > INT_MAX ? INT_MAX : (int)v;
}

/* Parse the operator that follows a leading SEN_QUERY_PREFIX.
   On entry q->cur points at the operator letter:
     'S'[digits]  similar search,  option defaults to DEFAULT_SIMILARITY_THRESHOLD
     'N'[digits]  near search,     option defaults to DEFAULT_MAX_INTERVAL
     'T'[digits]  term extraction, option defaults to DEFAULT_TERM_EXTRACT_POLICY
   For a recognised letter *mode and *option are set and q->cur is moved
   past the letter and its digits.  For any other character nothing is
   modified. */
inline static void
get_op(sen_query *q, int *mode, int *option)
{
  int fallback;
  const char *end;
  switch (*q->cur) {
  case 'S' :
    *mode = sen_sel_similar;
    fallback = DEFAULT_SIMILARITY_THRESHOLD;
    break;
  case 'N' :
    *mode = sen_sel_near;
    fallback = DEFAULT_MAX_INTERVAL;
    break;
  case 'T' :
    *mode = sen_sel_term_extract;
    fallback = DEFAULT_TERM_EXTRACT_POLICY;
    break;
  default :
    return;
  }
  *option = get_op_option(q->cur + 1, &end, fallback);
  q->cur = end;
}

/* Read the next node of the current expression.
   Operator, weight and mode markers met on the way are accumulated and
   stored in the node that follows them.  A bare word "OR" is not a token:
   it switches the pending operator to sen_sel_or and its cell is returned
   to the pool.
   Returns NULL at the end of the string, after consuming a closing
   SEN_QUERY_PARENR, or when the cons pool is exhausted. */
inline static cons *
get_token(sen_query *q)
{
  cons *tok = NULL;
  sen_sel_operator sel_op = q->default_op;
  int w = DEFAULT_WEIGHT, is_prefix = 0, sel_mode = -1, opt = 0;
  while (!tok) {
    skip_space(q);
    if (q->cur_expr > q->max_exprs) { return NULL; }
    switch (*q->cur) {
    case '\0' :
      return NULL;
    case SEN_QUERY_PARENR :
      q->cur += 1;
      return NULL;
    case SEN_QUERY_QUOTEL :
      q->cur += 1;
      tok = get_phrase(q);
      break;
    case SEN_QUERY_PREFIX :
      q->cur += 1;
      get_op(q, &sel_mode, &opt);
      break;
    case SEN_QUERY_AND :
      q->cur += 1;
      sel_op = sen_sel_and;
      break;
    case SEN_QUERY_BUT :
      q->cur += 1;
      sel_op = sen_sel_but;
      break;
    case SEN_QUERY_ADJ_INC :
      q->cur += 1;
      sel_op = sen_sel_adjust;
      /* weight is stored in an int8_t, so saturate at its limits */
      if (w < 127) { w += 1; }
      break;
    case SEN_QUERY_ADJ_DEC :
      q->cur += 1;
      sel_op = sen_sel_adjust;
      if (w > -128) { w -= 1; }
      break;
    case SEN_QUERY_ADJ_NEG :
      q->cur += 1;
      sel_op = sen_sel_adjust;
      w = -1;
      break;
    case SEN_QUERY_PARENL :
      q->cur += 1;
      tok = get_expr(q);
      break;
    default :
      tok = get_word(q, &is_prefix);
      if (tok &&
          tok->u.token.len == 2 &&
          tok->u.token.start[0] == 'O' &&
          tok->u.token.start[1] == 'R') {
        /* the keyword only changes the operator of the next token */
        sel_op = sen_sel_or;
        cons_del(q);
        tok = NULL;
      }
      break;
    }
  }
  tok->op = sel_op;
  tok->weight = w;
  /* a trailing prefix marker on a word overrides any explicit mode */
  if (is_prefix) {
    tok->mode = sen_sel_partial;
  } else {
    tok->mode = sel_mode;
  }
  tok->option = opt;
  return tok;
}

/* Parse a sequence of tokens into a new expression node, stopping when
   get_token() returns NULL.
   Returns the expression (possibly with no children), or NULL when no
   pool cell was available for it. */
static cons *
get_expr(sen_query *q)
{
  cons *list = expr_new(q);
  cons *item;
  if (!list) { return NULL; }
  for (item = get_token(q); item; item = get_token(q)) {
    nconc(list, item);
  }
  return list;
}

/* Copy the text of token `c` into the query's scratch buffer and
   NUL-terminate it.
   Returns the scratch buffer (overwritten by the next call), or NULL
   when `c` is not a token. */
inline static char *
qstr(sen_query *q, cons *c)
{
  char *out;
  size_t n;
  if (c->type != cons_token) { return NULL; }
  out = q->buffer;
  n = c->u.token.len;
  memcpy(out, c->u.token.start, n);
  out[n] = '\0';
  return out;
}

/* Convert the number at the head of the span [start, end) to int, giving
   the same result atoi() would give on a NUL-terminated copy of the span,
   but without a temporary allocation and with out-of-range values clamped
   instead of being undefined. */
inline static int
get_pragma_int(const char *start, const char *end)
{
  char *stop;
  long v = strtol(start, &stop, 10);
  /* strtol() skips leading whitespace; if it only found digits beyond
     the span, the span itself holds no number */
  if ((const char *)stop > end) { return 0; }
  if (v > INT_MAX) { return INT_MAX; }
  if (v < INT_MIN) { return INT_MIN; }
  return (int)v;
}

/* Parse the pragmas at the very start of the query.  Each pragma is
   introduced by SEN_QUERY_PREFIX:
     'E' threshold [',' decaystep]   sets escalation_threshold and,
                                     optionally, escalation_decaystep
     'D' op                          sets default_op from the first
                                     character of the argument: 'O' (or),
                                     SEN_QUERY_AND, SEN_QUERY_BUT or
                                     SEN_QUERY_ADJ_INC
   q->cur is moved past each recognised pragma.  Scanning stops at the
   first position that does not hold SEN_QUERY_PREFIX; a prefix followed
   by an unrecognised letter leaves q->cur unchanged. */
inline static void
get_pragma(sen_query *q)
{
  const char *start, *end = q->cur;
  while (*end == SEN_QUERY_PREFIX) {
    end++;
    switch (*end) {
    case 'E' :
      start = ++end;
      /* <ctype.h> functions require a value representable as unsigned char */
      while (isdigit((unsigned char)*end) || *end == '-') { end++; }
      if (end > start) {
        q->escalation_threshold = get_pragma_int(start, end);
      }
      if (*end == ',') {
        start = ++end;
        while (*end && *end != SEN_QUERY_PREFIX && !sen_isspace(end, q->encoding)) {
          end++;
        }
        if (end > start) {
          q->escalation_decaystep = get_pragma_int(start, end);
        }
      }
      q->cur = end;
      break;
    case 'D' :
      start = ++end;
      while (*end && *end != SEN_QUERY_PREFIX && !sen_isspace(end, q->encoding)) {
        end++;
      }
      if (end > start) {
        switch (*start) {
        case 'O' :
          q->default_op = sen_sel_or;
          break;
        case SEN_QUERY_AND :
          q->default_op = sen_sel_and;
          break;
        case SEN_QUERY_BUT :
          q->default_op = sen_sel_but;
          break;
        case SEN_QUERY_ADJ_INC :
          q->default_op = sen_sel_adjust;
          break;
        default :
          /* unknown operator: keep the caller-supplied default */
          break;
        }
      }
      q->cur = end;
      break;
    default :
      /* not a pragma; the prefix is left for get_token() to handle */
      break;
    }
  }
}

/* Parse the query string `str` into a new sen_query.
   str         NUL-terminated query text.  Tokens keep pointers into it,
               so it must stay valid for as long as the query is used.
   default_op  operator for tokens without an explicit operator marker
               (a leading 'D' pragma in `str` may override it).
   max_exprs   size of the cons pool; must not be negative.
   encoding    encoding of `str`.
   Returns the query, to be released with sen_query_close(), or NULL on
   invalid arguments or allocation failure. */
sen_query *
sen_query_open(const char *str, sen_sel_operator default_op,
	       int max_exprs, sen_encoding encoding)
{
  sen_query *q;
  /* a negative pool size would shrink the allocation below
     sizeof(sen_query) */
  if (!str || max_exprs < 0) { return NULL; }
  if (!(q = SEN_MALLOC(sizeof(sen_query) + (size_t)max_exprs * sizeof(cons)))) {
    return NULL;
  }
  q->str = str;
  q->cur = str;
  q->buffer = NULL;
  q->max_size = 0;
  q->default_op = default_op;
  /* overwritten by sen_query_exec(); set here so it is never read
     uninitialised */
  q->default_mode = sen_sel_exact;
  q->encoding = encoding;
  q->expr = NULL;
  q->max_exprs = max_exprs;
  q->cur_expr = 0;
  q->escalation_threshold = sen_query_escalation_threshold;
  q->escalation_decaystep = DEFAULT_DECAYSTEP;
  q->weight_offset = 0;
  get_pragma(q);
  q->expr = get_expr(q);
  /* qstr() writes into this buffer unconditionally, so a query without
     it (or without a root expression) must not be handed out */
  if (!q->expr || !(q->buffer = SEN_MALLOC(q->max_size + 1))) {
    SEN_FREE(q);
    return NULL;
  }
  return q;
}

/* Return the position inside the query string at which parsing stopped,
   i.e. the unparsed rest of the string.  Returns NULL when `q` is NULL,
   in line with the argument checks of sen_query_close() and
   sen_query_exec(). */
const char *
sen_query_rest(sen_query *q)
{
  if (!q) { return NULL; }
  return q->cur;
}

/* Release a query: frees its scratch buffer and then the query itself.
   Returns sen_invalid_argument when `q` is NULL, sen_success otherwise. */
sen_rc
sen_query_close(sen_query *q)
{
  if (q) {
    SEN_FREE(q->buffer);
    SEN_FREE(q);
    return sen_success;
  }
  return sen_invalid_argument;
}

/* Evaluate node `c` against index `i` and fold the outcome into `r`
   using operator `op`.
   A token is looked up with sen_index_select().  An expression evaluates
   its children in order: the first child always with sen_sel_or, every
   later child with its own stored operator.  When `r` already holds
   hits, the children are collected in a temporary record set which is
   then merged into `r` according to `op`; when `r` is empty they are
   written straight into `r`.
   Nothing is done when `r` is empty and `op` is not sen_sel_or. */
static void
exec_query(sen_index *i, sen_query *q, cons *c, sen_records *r, sen_sel_operator op)
{
  int n = sen_records_nhits(r);
  if (!n && op != sen_sel_or) { return; }
  if (c->type == cons_token) {
    sen_select_optarg opt;
    char *str = qstr(q, c);
    opt.mode = c->mode == -1 ? q->default_mode : c->mode;
    opt.similarity_threshold = c->option;
    opt.max_interval = c->option;
    /* no weight vector is supplied; vector_size carries the effective
       weight of the token */
    opt.weight_vector = NULL;
    opt.vector_size = c->weight + q->weight_offset;
    opt.func = NULL;
    opt.func_arg = NULL;
    sen_log("mode=%d option=%d w=%d o=%d", opt.mode, c->option, c->weight, q->weight_offset);
    sen_index_select(i, str, r, op, &opt);
  } else {
    cons *token = c->u.expr.car;
    if (token) {
      sen_sel_operator t_op;
      sen_records *s = n ? sen_records_open(sen_rec_document, sen_rec_none, 0) : r;
      if (!s) {
        /* the temporary set could not be created; evaluating the
           children would dereference NULL, so skip this sub-expression */
        sen_log("exec_query: sen_records_open failed");
        return;
      }
      for (t_op = sen_sel_or;; t_op = token->op) {
        exec_query(i, q, token, s, t_op);
        if (!(token = token->cdr)) { break; }
      }
      if (n) {
        /* NOTE(review): the merge functions are defined elsewhere; a
           NULL result is treated as "s was not consumed", so s is closed
           here - confirm against their contract */
        switch (op) {
        case sen_sel_or :
          if (!sen_records_union(r, s)) { sen_records_close(s); }
          break;
        case sen_sel_and :
          if (!sen_records_intersect(r, s)) { sen_records_close(s); }
          break;
        case sen_sel_but :
          if (!sen_records_subtract(r, s)) { sen_records_close(s); }
          break;
          /* todo: adjust
        case sen_sel_adjust :
          break;
          */
        default :
          /* unsupported operator: the sub-expression result is dropped */
          sen_records_close(s);
          break;
        }
      }
    }
  }
}

/* Run query `q` against index `i`, folding the hits into `r` with `op`.
   Up to three passes are made with progressively looser default modes
   (exact, unsplit, partial), controlled by escalation_threshold `p`:
     p >= 0  the exact pass always runs; each looser pass runs only while
             the number of hits is still <= p
     p <  0  -p is a bit mask selecting the passes: 1 = exact,
             2 = unsplit, 4 = partial
   Each looser pass that runs lowers the token weights by
   escalation_decaystep.
   Returns sen_invalid_argument when an argument is NULL or the query has
   no parsed expression, sen_success otherwise. */
sen_rc
sen_query_exec(sen_index *i, sen_query *q, sen_records *r, sen_sel_operator op)
{
  int p;
  if (!i || !q || !r || !q->expr) { return sen_invalid_argument; }
  p = q->escalation_threshold;
  /* start every execution from the undecayed weights; otherwise the
     decay applied below would pile up across repeated calls on the same
     query */
  q->weight_offset = 0;
  if (p >= 0 || (-p & 1)) {
    q->default_mode = sen_sel_exact;
    exec_query(i, q, q->expr, r, op);
    sen_log("hits(exact)=%d", sen_records_nhits(r));
  }
  if ((p >= 0) ? (p >= sen_records_nhits(r)) : (-p & 2)) {
    q->weight_offset -= q->escalation_decaystep;
    q->default_mode = sen_sel_unsplit;
    exec_query(i, q, q->expr, r, op);
    sen_log("hits(unsplit)=%d", sen_records_nhits(r));
  }
  if ((p >= 0) ? (p >= sen_records_nhits(r)) : (-p & 4)) {
    q->weight_offset -= q->escalation_decaystep;
    q->default_mode = sen_sel_partial;
    exec_query(i, q, q->expr, r, op);
    sen_log("hits(partial)=%d", sen_records_nhits(r));
  }
  return sen_success;
}

#ifdef DEBUG

/* Debug helper: print the tree below `c` to stdout, one node per line,
   as "weight:op" followed by the token text.  Each line is indented by
   `level` spaces; a token's text is indented by `level` spaces once more
   after the "weight:op" part.  Children are printed one level deeper. */
static void
dump_query(sen_query *q, cons *c, int level)
{
  int pad;
  cons *child;
  pad = level;
  while (pad != 0) { putchar(' '); pad--; }
  printf("%d:%d ", c->weight, c->op);
  if (c->type != cons_token) {
    putchar('\n');
    child = c->u.expr.car;
    while (child) {
      dump_query(q, child, level + 1);
      child = child->cdr;
    }
    return;
  }
  pad = level;
  while (pad != 0) { putchar(' '); pad--; }
  puts(qstr(q, c));
}

#endif /* DEBUG */
