/* Copyright(C) 2004 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
#include "senna_in.h"
#include <string.h>
#include <ctype.h>
#include "str.h"
#include "lex.h"

/* ngram */

/*
 * Allocate and initialise a lexer that tokenizes `str` as n-grams.
 *
 * sym  : symbol table the tokens are resolved against; its flags and
 *        encoding drive the lexer configuration.
 * str  : NUL-terminated input text.
 * addp : stored in lex->addp and consulted by sen_ngram_next when it
 *        resolves each token.
 *
 * When SEN_INDEX_NORMALIZE is set in sym->flags the text is normalized
 * (blanks removed, per-character ctypes recorded) and tokenization runs
 * over the normalized form; if normalization fails the raw string is used.
 * The uni_alpha / uni_digit / uni_symbol switches are turned on only when
 * a normalized string exists and the matching SEN_INDEX_SPLIT_* flag is
 * not set.
 *
 * Returns the new lexer, or NULL when the allocation fails.
 */
inline static sen_lex *
sen_ngram_open(sen_sym *sym, const char *str, int addp)
{
  int normalized;
  sen_lex *lex = SEN_MALLOC(sizeof(sen_lex));
  if (!lex) { return NULL; }

  lex->sym = sym;
  lex->encoding = sym->encoding;
  lex->addp = addp;
  lex->status = sen_lex_doing;
#ifndef NO_MECAB
  lex->mecab = NULL;
#endif /* NO_MECAB */
  lex->buf = NULL;

  /* Position bookkeeping: the first call to sen_ngram_next starts at
     pos + skip == 0. */
  lex->pos = -1;
  lex->skip = 1;
  lex->tail = 0;

  lex->nstr = NULL;
  if (sym->flags & SEN_INDEX_NORMALIZE) {
    lex->nstr = sen_nstr_open(str, lex->encoding,
                              SEN_STR_REMOVEBLANK | SEN_STR_WITH_CTYPES);
    if (lex->nstr) { str = lex->nstr->norm; }
  }
  lex->orig = (unsigned char *)str;
  lex->next = (unsigned char *)str;

  /* Character-class grouping needs the ctypes table of a normalized string. */
  normalized = (lex->nstr != NULL);
  lex->uni_alpha = (normalized && !(sym->flags & SEN_INDEX_SPLIT_ALPHA));
  lex->uni_digit = (normalized && !(sym->flags & SEN_INDEX_SPLIT_DIGIT));
  lex->uni_symbol = (normalized && !(sym->flags & SEN_INDEX_SPLIT_SYMBOL));
  return lex;
}

/*
 * Produce the next n-gram token from the lexer and return its symbol id.
 *
 * Scanning resumes at lex->next; the character position of the new token
 * is lex->pos + lex->skip.  Two kinds of token are produced:
 *
 *  - When the matching uni_alpha / uni_digit / uni_symbol switch is on and
 *    the current character has that ctype, the whole run of characters of
 *    that class becomes a single token.
 *  - Otherwise a token of up to SEN_LEX_NGRAM_UNIT_SIZE characters is cut
 *    starting at the current character, and the next scan starts one
 *    character later.
 *
 * The token text is copied to a temporary buffer and resolved with
 * sen_sym_get when lex->addp is set, otherwise with sen_sym_at
 * (presumably get-or-add vs. lookup-only -- confirm against the sym API).
 *
 * On success lex->pos, lex->len, lex->tail, lex->next and lex->skip are
 * updated and the id is returned.  SEN_SYM_NIL is returned when
 *  - lex->status is already non-zero on entry,
 *  - the token is not resolved (status becomes sen_lex_not_found),
 *  - the input is exhausted (status becomes sen_lex_done), or
 *  - the temporary buffer cannot be allocated (status is left unchanged).
 */
inline static sen_id
sen_ngram_next(sen_lex *lex)
{
  char *buf;
  sen_id tid;
  sen_sym *sym = lex->sym;
  uint_least8_t *cp = NULL;
  int32_t len = 0, pos;
  const unsigned char *p, *q, *r;
  /* Any non-zero status (done / not found) ends tokenization. */
  if (lex->status) { return SEN_SYM_NIL; }
  /* p: start of the candidate token, r: where the next scan will start,
     pos: character index of p.  The loop only iterates again through the
     `continue` statements below; every other path returns. */
  for (p = lex->next, pos = lex->pos + lex->skip; *p; p = r, pos++) {
    /* cp walks the per-character ctype table of the normalized string.
       The uni_* switches are only set when lex->nstr exists, so cp is
       non-NULL whenever they are tested. */
    if (lex->nstr) { cp = lex->nstr->ctypes + pos; }
    if (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) {
      /* Group a run of alphabetic characters; len counts its characters.
         The run ends after a character flagged blank, at the end of the
         text, or when the next character is of another class. */
      for (len = 1, r = p;;len++) {
	r += sen_str_charlen((char *)r, lex->encoding);
	if (SEN_NSTR_ISBLANK(*cp)) { break; }
	if (!*r || SEN_NSTR_CTYPE(*++cp) != sen_str_alpha) { break; }
      }
      if (!(buf = SEN_MALLOC(r - p + 1))) { return SEN_SYM_NIL; }
      memcpy(buf, p, r - p);
      buf[r - p] = '\0';
      tid = lex->addp ? sen_sym_get(sym, buf) : sen_sym_at(sym, buf);
      SEN_FREE(buf);
      if (tid == SEN_SYM_NIL) {
	lex->status = sen_lex_not_found;
	return SEN_SYM_NIL;
      }
      /* The next token's position advances past the whole run. */
      lex->skip = len;
    } else if (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) {
      /* Same grouping as above, for a run of digits. */
      for (len = 1, r = p;;len++) {
	r += sen_str_charlen((char *)r, lex->encoding);
	if (SEN_NSTR_ISBLANK(*cp)) { break; }
	if (!*r || SEN_NSTR_CTYPE(*++cp) != sen_str_digit) { break; }
      }
      if (!(buf = SEN_MALLOC(r - p + 1))) { return SEN_SYM_NIL; }
      memcpy(buf, p, r - p);
      buf[r - p] = '\0';
      tid = lex->addp ? sen_sym_get(sym, buf) : sen_sym_at(sym, buf);
      SEN_FREE(buf);
      if (tid == SEN_SYM_NIL) {
	lex->status = sen_lex_not_found;
	return SEN_SYM_NIL;
      }
      lex->skip = len;
    } else if (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol) {
      /* Same grouping as above, for a run of symbol characters. */
      for (len = 1, r = p;;len++) {
	r += sen_str_charlen((char *)r, lex->encoding);
	if (SEN_NSTR_ISBLANK(*cp)) { break; }
	if (!*r || SEN_NSTR_CTYPE(*++cp) != sen_str_symbol) { break; }
      }
      if (!(buf = SEN_MALLOC(r - p + 1))) { return SEN_SYM_NIL; }
      memcpy(buf, p, r - p);
      buf[r - p] = '\0';
      tid = lex->addp ? sen_sym_get(sym, buf) : sen_sym_at(sym, buf);
      SEN_FREE(buf);
      if (tid == SEN_SYM_NIL) {
	lex->status = sen_lex_not_found;
	return SEN_SYM_NIL;
      }
      lex->skip = len;
    } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
      /* Optional build: consult the symbol table for a registered key that
         is a prefix of the remaining text.
         NOTE(review): this section is compiled only when
         PRE_DEFINED_UNSPLIT_WORDS is defined; it advances p past the
         matched key but its result is overwritten by the n-gram code
         below -- confirm the intended behaviour before enabling. */
      {
	const unsigned char *key = NULL;
	if ((tid = sen_sym_common_prefix_search(sym, p))) {
	  key = _sen_sym_key(sym, tid);
	  len = sen_str_len(key, lex->encoding, NULL);
	}
	r = p + sen_str_charlen(p, lex->encoding);
	if (tid && (len > 1 || !*r)) {
	  if (*r && pos + len - 1 <= lex->tail) { continue; }
	  p += strlen(key);
	  if (!*p && !lex->addp) { lex->status = sen_lex_done; }
	}
      }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
      /* Plain n-gram: the next scan starts one character after p, so
         consecutive tokens overlap. */
      r = p + sen_str_charlen((char *)p, lex->encoding);
      {
	int blankp = 0;
	/* Extend q from one character up to SEN_LEX_NGRAM_UNIT_SIZE
	   characters.  cp is stepped to the ctype of the character at q
	   before that character is examined.  Extension stops after a
	   character flagged blank, at the end of the text, or in front of a
	   character that belongs to a grouped (uni_*) class. */
	for (len = 1, q = r; len < SEN_LEX_NGRAM_UNIT_SIZE; len++) {
	  if (cp) {
	    if (SEN_NSTR_ISBLANK(*cp)) { blankp++; break; }
	    cp++;
	  }
	  if (!*q ||
	      (lex->uni_alpha && SEN_NSTR_CTYPE(*cp) == sen_str_alpha) ||
	      (lex->uni_digit && SEN_NSTR_CTYPE(*cp) == sen_str_digit) ||
	      (lex->uni_symbol && SEN_NSTR_CTYPE(*cp) == sen_str_symbol)) {
	    break;
	  }
	  q += sen_str_charlen((char *)q, lex->encoding);
	}
	/* When addp is not set, a token cut short by a blank is skipped and
	   scanning moves on to the next character. */
	if (blankp && !lex->addp) { continue; }
      }
      /* When addp is not set, a token that reaches the end of the text is
         the last one returned. */
      if (!*q && !lex->addp) { lex->status = sen_lex_done; }
      if (!(buf = SEN_MALLOC(q - p + 1))) { return SEN_SYM_NIL; }
      memcpy(buf, p, q - p);
      buf[q - p] = '\0';
      tid = lex->addp ? sen_sym_get(sym, buf) : sen_sym_at(sym, buf);
      SEN_FREE(buf);
      if (tid == SEN_SYM_NIL) {
	lex->status = sen_lex_not_found;
	return SEN_SYM_NIL;
      }
      lex->skip = 1;
    }
    /* Record where this token sits (position, length in characters, index
       of its last character) and where the next scan resumes. */
    lex->pos = pos;
    lex->len = len;
    lex->tail = pos + len - 1;
    lex->next = r;
    // printf("tid=%d pos=%d tail=%d (%s) %s\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid), r);
    // printf("tid=%d pos=%d tail=%d (%s)\n", tid, lex->pos, lex->tail, _sen_sym_key(sym, tid));
    if (!*r) { lex->status = sen_lex_done; }
    return tid;
  }
  /* Nothing left to scan. */
  lex->status = sen_lex_done;
  return SEN_SYM_NIL;
}

/* mecab */

#ifndef NO_MECAB

/* The single MeCab instance shared by every lexer; created in sen_lex_init
   and destroyed in sen_lex_fin. */
static mecab_t *sole_mecab;
/* Held around each call into sole_mecab (see sen_mecab_open). */
sen_mutex sole_mecab_lock;

/*
 * Allocate and initialise a lexer that tokenizes `str` with MeCab.
 *
 * sym  : symbol table the tokens are resolved against; its flags and
 *        encoding drive the lexer configuration.
 * str  : NUL-terminated input text.
 * addp : stored in lex->addp and consulted by sen_mecab_next when it
 *        resolves each token.
 *
 * When SEN_INDEX_NORMALIZE is set in sym->flags the text is normalized
 * first; if normalization fails the raw string is used.  The text is then
 * passed to the shared MeCab instance (created with "-Owakati" in
 * sen_lex_init) under sole_mecab_lock.  The analysed output is stored in
 * lex->buf, with trailing whitespace removed, and is later split on
 * whitespace by sen_mecab_next.
 *
 * The output buffer starts at 2 * strlen(str) + 1 bytes and is doubled, for
 * at most 10 attempts, while mecab_sparse_tostr3 reports failure.
 *
 * Returns the new lexer, or NULL when an allocation fails or MeCab keeps
 * failing.  On failure everything allocated here is released.
 */
inline static sen_lex *
sen_mecab_open(sen_sym *sym, const char *str, int addp)
{
  unsigned int bufsize, maxtrial = 10, len;
  char *buf = NULL, *s = NULL;
  size_t n;
  sen_lex *lex;
  if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
  lex->sym = sym;
  lex->mecab = sole_mecab;
  lex->buf = NULL;
  lex->pos = -1;
  lex->offset = 0;
  lex->len = 0;
  lex->addp = addp;
  lex->status = sen_lex_doing;
  lex->encoding = sym->encoding;
  if (sym->flags & SEN_INDEX_NORMALIZE) {
    if ((lex->nstr = sen_nstr_open(str, lex->encoding, 0))) {
      str = lex->nstr->norm;
    }
  } else {
    lex->nstr = NULL;
  }
  len = (unsigned int)strlen(str);
  for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
    /* Never hand a NULL output buffer to MeCab. */
    if (!(buf = SEN_MALLOC(bufsize + 1))) {
      sen_lex_close(lex);
      return NULL;
    }
    MUTEX_LOCK(sole_mecab_lock);
    s = mecab_sparse_tostr3(lex->mecab, (char *)str, len, buf, bufsize);
    MUTEX_UNLOCK(sole_mecab_lock);
    if (s) { break; }
    SEN_FREE(buf);
    buf = NULL;
  }
  if (!maxtrial) {
    sen_log("mecab_sparse_tostr failed len=%d bufsize=%d", len, bufsize);
    sen_lex_close(lex);
    return NULL;
  }
  /* Strip trailing whitespace (isspace covers '\n').  The scan is bounded by
     the start of the buffer so empty or all-blank output cannot run off the
     front, and each byte is converted to unsigned char because passing a
     negative char to isspace is undefined. */
  for (n = strlen(buf); n > 0 && isspace((unsigned char)buf[n - 1]); n--) {
    buf[n - 1] = '\0';
  }
  lex->orig = (unsigned char *)str;
  lex->buf = (unsigned char *)buf;
  lex->next = (unsigned char *)buf;
  return lex;
}

/*
 * Return the symbol id of the next whitespace-separated word in the MeCab
 * output held by the lexer.
 *
 * A word starts at lex->next, always contains at least one character, and
 * extends until the following byte is whitespace or the text ends.  It is
 * resolved with sen_sym_get when lex->addp is set, otherwise with
 * sen_sym_at.
 *
 * Returns SEN_SYM_NIL when lex->status is already non-zero, when no text is
 * left (status becomes sen_lex_done), when the temporary copy cannot be
 * allocated (lexer left untouched), or when the word is not resolved
 * (status becomes sen_lex_not_found).  lex->pos is incremented for every
 * word that reaches the symbol lookup, resolved or not.
 */
inline static sen_id
sen_mecab_next(sen_lex *lex)
{
  const unsigned char *start, *end;
  const int32_t next_offset = lex->offset + lex->len;
  int32_t nchars = 0;
  uint32_t nbytes;
  char *word;
  sen_id id;

  if (lex->status) { return SEN_SYM_NIL; }

  /* Consume characters until the byte that follows is whitespace or NUL. */
  start = lex->next;
  end = start;
  while (*end) {
    end += sen_str_charlen((char *)end, lex->encoding);
    nchars++;
    if (isspace(*end)) { break; }
  }
  if (nchars == 0) {
    lex->status = sen_lex_done;
    return SEN_SYM_NIL;
  }

  /* Resolve a NUL-terminated copy of the word. */
  nbytes = (uint32_t)(end - start);
  word = SEN_MALLOC(nbytes + 1);
  if (!word) { return SEN_SYM_NIL; }
  memcpy(word, start, nbytes);
  word[nbytes] = '\0';
  if (lex->addp) {
    id = sen_sym_get(lex->sym, word);
  } else {
    id = sen_sym_at(lex->sym, word);
  }
  SEN_FREE(word);

  lex->pos++;
  if (id == SEN_SYM_NIL) {
    lex->status = sen_lex_not_found;
    return id;
  }

  /* Step over the separator(s) and remember where this word was. */
  while (isspace(*end)) { end++; }
  lex->next = end;
  lex->offset = next_offset;
  lex->len = nchars;
  if (!*end) { lex->status = sen_lex_done; }
  return id;
}

#endif /* NO_MECAB */

/* delimited */

/*
 * Allocate and initialise a lexer that splits `str` on whitespace.
 *
 * sym  : symbol table the tokens are resolved against; its flags and
 *        encoding drive the lexer configuration.
 * str  : NUL-terminated input text.
 * addp : stored in lex->addp and consulted by sen_delimited_next when it
 *        resolves each token.
 *
 * When SEN_INDEX_NORMALIZE is set in sym->flags the text is normalized
 * first; if normalization fails the raw string is used.  Leading whitespace
 * is skipped, and a text with nothing else in it yields a lexer whose status
 * is already sen_lex_done.
 *
 * Returns the new lexer, or NULL when the allocation fails.
 */
inline static sen_lex *
sen_delimited_open(sen_sym *sym, const char *str, int addp)
{
  sen_lex *lex;
  if (!(lex = SEN_MALLOC(sizeof(sen_lex)))) { return NULL; }
  lex->sym = sym;
#ifndef NO_MECAB
  lex->mecab = NULL;
#endif /* NO_MECAB */
  lex->buf = NULL;
  lex->pos = -1;
  lex->skip = 1;
  lex->tail = 0;
  lex->addp = addp;
  lex->status = sen_lex_doing;
  lex->encoding = sym->encoding;
  if (sym->flags & SEN_INDEX_NORMALIZE) {
    if ((lex->nstr = sen_nstr_open(str, lex->encoding, 0))) {
      str = lex->nstr->norm;
    }
  } else {
    lex->nstr = NULL;
  }
  lex->orig = (unsigned char *)str;
  /* Bytes of multibyte text can be negative as plain char; isspace is only
     defined for unsigned char values and EOF, so convert first. */
  while (isspace((unsigned char)*str)) { str++; }
  lex->next = (unsigned char *)str;
  lex->offset = 0;
  lex->len = 0;
  if (!*str) { lex->status = sen_lex_done; }
  return lex;
}

/*
 * Return the symbol id of the next whitespace-delimited word of the text.
 *
 * A word starts at lex->next, always contains at least one character, and
 * extends until the following byte is whitespace or the text ends.  It is
 * resolved with sen_sym_get when lex->addp is set, otherwise with
 * sen_sym_at.
 *
 * Returns SEN_SYM_NIL when lex->status is already non-zero, when no text is
 * left (status becomes sen_lex_done), when the temporary copy cannot be
 * allocated (lexer left untouched), or when the word is not resolved
 * (status becomes sen_lex_not_found).  lex->pos is incremented for every
 * word that reaches the symbol lookup, resolved or not.
 */
inline static sen_id
sen_delimited_next(sen_lex *lex)
{
  sen_id found;
  char *copy;
  uint32_t nbytes;
  int32_t nchars = 0;
  const int32_t word_offset = lex->offset + lex->len;
  const unsigned char *head, *cur;

  if (lex->status) { return SEN_SYM_NIL; }

  /* Take one character unconditionally, then keep going while the next
     byte is neither NUL nor whitespace. */
  head = lex->next;
  cur = head;
  if (*cur) {
    do {
      cur += sen_str_charlen((char *)cur, lex->encoding);
      nchars++;
    } while (*cur && !isspace(*cur));
  }
  if (!nchars) {
    lex->status = sen_lex_done;
    return SEN_SYM_NIL;
  }

  nbytes = (uint32_t)(cur - head);
  if (!(copy = SEN_MALLOC(nbytes + 1))) { return SEN_SYM_NIL; }
  memcpy(copy, head, nbytes);
  copy[nbytes] = '\0';
  found = lex->addp ? sen_sym_get(lex->sym, copy) : sen_sym_at(lex->sym, copy);
  SEN_FREE(copy);

  if (found != SEN_SYM_NIL) {
    /* Skip the delimiter(s) and record the extent of this word. */
    while (isspace(*cur)) { cur++; }
    lex->next = cur;
    lex->offset = word_offset;
    lex->len = nchars;
    if (!*cur) { lex->status = sen_lex_done; }
  } else {
    lex->status = sen_lex_not_found;
  }
  lex->pos++;
  return found;
}

/* external */

/*
 * Set up the module-wide tokenizer state.
 *
 * Unless built with NO_MECAB, creates the shared MeCab instance with the
 * "-Owakati" option and then initialises the mutex that guards it.
 *
 * Returns sen_success, or sen_other_error when MeCab cannot be created (the
 * mutex is not initialised in that case).
 */
sen_rc
sen_lex_init(void)
{
#ifndef NO_MECAB
  char *mecab_args[] = {"", "-Owakati"};
  sole_mecab = mecab_new(2, mecab_args);
  if (!sole_mecab) { return sen_other_error; }
  MUTEX_INIT(sole_mecab_lock);
#endif /* NO_MECAB */
  return sen_success;
}

/*
 * Release the module-wide tokenizer state created by sen_lex_init.
 *
 * Unless built with NO_MECAB, destroys the shared MeCab instance and its
 * mutex.  sen_lex_init initialises the mutex only after MeCab was created
 * successfully, so both are torn down only when the instance exists.  The
 * handle is reset afterwards, which makes a repeated call, or a call after
 * a failed sen_lex_init, a no-op instead of a double destroy.
 *
 * Always returns sen_success.
 */
sen_rc
sen_lex_fin(void)
{
#ifndef NO_MECAB
  if (sole_mecab) {
    mecab_destroy(sole_mecab);
    sole_mecab = NULL;
    MUTEX_DESTROY(sole_mecab_lock);
  }
#endif /* NO_MECAB */
  return sen_success;
}

/*
 * Create a lexer for `str` using the tokenizer selected by the
 * SEN_INDEX_TOKENIZER_MASK bits of sym->flags.
 *
 * sym  : symbol table the tokens are resolved against.
 * str  : NUL-terminated input text.
 * addp : forwarded unchanged to the tokenizer-specific open function.
 *
 * Returns the new lexer, or NULL when the tokenizer is unknown, when
 * morphological analysis is requested in a NO_MECAB build, or when the
 * tokenizer-specific open function fails.
 */
sen_lex *
sen_lex_open(sen_sym *sym, const char *str, int addp)
{
  sen_lex *lex = NULL;
  switch (sym->flags & SEN_INDEX_TOKENIZER_MASK) {
  case SEN_INDEX_NGRAM :
    lex = sen_ngram_open(sym, str, addp);
    break;
  case SEN_INDEX_DELIMITED :
    lex = sen_delimited_open(sym, str, addp);
    break;
#ifndef NO_MECAB
  case SEN_INDEX_MORPH_ANALYSE :
    lex = sen_mecab_open(sym, str, addp);
    break;
#endif /* NO_MECAB */
  default :
    /* Unknown tokenizer, or MeCab support compiled out. */
    break;
  }
  return lex;
}

/*
 * Fetch the next token from `lex` using the tokenizer selected by the
 * SEN_INDEX_TOKENIZER_MASK bits of lex->sym->flags.
 *
 * Returns whatever the tokenizer-specific next function returns, or
 * sen_invalid_argument when the tokenizer is unknown or morphological
 * analysis is requested in a NO_MECAB build.
 *
 * NOTE(review): the declared return type is sen_rc, but the tokenizer
 * functions return a sen_id (SEN_SYM_NIL when no token is produced), so the
 * value is really a token id.  Confirm how callers interpret
 * sen_invalid_argument.
 */
sen_rc
sen_lex_next(sen_lex *lex)
{
  sen_rc result = sen_invalid_argument;
  switch (lex->sym->flags & SEN_INDEX_TOKENIZER_MASK) {
  case SEN_INDEX_NGRAM :
    result = sen_ngram_next(lex);
    break;
  case SEN_INDEX_DELIMITED :
    result = sen_delimited_next(lex);
    break;
#ifndef NO_MECAB
  case SEN_INDEX_MORPH_ANALYSE :
    result = sen_mecab_next(lex);
    break;
#endif /* NO_MECAB */
  default :
    /* Unknown tokenizer, or MeCab support compiled out. */
    break;
  }
  return result;
}

/*
 * Destroy a lexer created by sen_lex_open.
 *
 * Releases the tokenizer output buffer (lex->buf) and the normalized string
 * (lex->nstr) when present, then the lexer itself.  The shared MeCab
 * instance is not touched.
 *
 * Returns sen_invalid_argument when `lex` is NULL, sen_success otherwise.
 */
sen_rc
sen_lex_close(sen_lex *lex)
{
  if (lex == NULL) { return sen_invalid_argument; }
  if (lex->buf != NULL) { SEN_FREE(lex->buf); }
  if (lex->nstr != NULL) { sen_nstr_close(lex->nstr); }
  SEN_FREE(lex);
  return sen_success;
}
