/*
 MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

 $Id: tagger.cpp,v 1.24 2003/04/14 15:18:33 taku-ku Exp $;

 Copyright (C) 2001-2002  Taku Kudo <taku-ku@is.aist-nara.ac.jp>
 All rights reserved.

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public
 License as published by the Free Software Foundation; either
 version 2 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Library General Public License for more details.

 You should have received a copy of the GNU Library General Public
 License along with this library; if not, write to the
 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA.
*/
#include "viterbi.h"
#include "japanese_tokenizer.h"
#include "common.h"
#include "mutex.h"
#include "param.h"
#include "mecab.h"
#include <stdexcept>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <strstream>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

namespace MeCab
{
  // Table of command-line options understood by the tagger.  It is passed to
  // Param::open (), Param::help () and Param::version () further down in this file.
  // NOTE(review): the columns look like
  //   { long name, short name, <third field, 0 in every row>, argument placeholder, help text }
  // and the all-zero last row looks like the end-of-table sentinel -- confirm
  // both against the declaration of Option.
  static const Option long_options[] = 
  {
    { "rcfile",             'r', 0, "FILE", "use FILE as resource file" },
    { "dicdir",             'd', 0, "DIR",  "set DIR as dicdir"                        },
    { "build-all-lattice",  'a', 0, 0,      "build all lattice in result (default no)" },
    { "input-format-type",  'I', 0, "TYPE", "set input format type (line,lattice)"     },
    { "output-format-type", 'O', 0, "TYPE", "set output format type (wakati,none,...)" },
    { "node-format",        'F', 0, "STR",  "use STR as the user-defined node format"  },
    { "bos-format",         'B', 0, "STR",  "use STR as the user-defined bos format"   },
    { "eos-format",         'E', 0, "STR",  "use STR as the user-defined eos format"   },
    { "input-buffer-size",  'b', 0, "INT",  "set input buffer size (default 8192)"     },
    { "output",             'o', 0, "FILE", "set the output file name"                 },
    { "version",            'v', 0, 0,      "show the version and exit."               },
    { "help",               'h', 0, 0,      "show this help and exit."                 },
    { 0, 0, 0, 0 }
  };

  // Forward declaration; the definition is not in this file.  Tagger::writeUser ()
  // calls it once per node with a format string, Tagger::ibuf, the node and the
  // destination stream.
  // NOTE(review): the meaning of the char* argument is not visible here -- confirm in the definition.
  std::ostream& writeNode (const char*, char*, Node*, std::ostream&);

  // Picks the resource (rc) file to load.
  //
  // Candidates are tried in this order; the first hit is returned:
  //   1. the "rcfile" profile value, when it is not empty
  //   2. $HOME/.mecabrc, when that file can be opened         (HAVE_GETENV builds only)
  //   3. the value of the MECABRC environment variable         (HAVE_GETENV builds only)
  //   4. the REG_SZ value "mecabrc" under HKEY_CURRENT_USER\software\mecab
  //                                                            (native Win32 builds only)
  //   5. the compiled-in MECAB_DEFAULT_RC
  static std::string getDefaultRc (Param &param)
  {
    std::string rcfile = param.getProfileString ("rcfile");
    if (! rcfile.empty()) return rcfile;

#ifdef HAVE_GETENV
    const char *homedir = getenv ("HOME");
    if (homedir) {
      std::string s = MeCab::createFileName (std::string(homedir), ".mecabrc");
      std::ifstream ifs(s.c_str());
      if (ifs) return s;
    }

    const char *rcenv = getenv ("MECABRC");
    if (rcenv) return std::string(rcenv);
#endif

#if defined  (_WIN32) && ! defined (__CYGWIN__)
    HKEY hKey;
    // KEY_READ is all a query needs; KEY_ALL_ACCESS can be refused for
    // restricted accounts.  The handle is only used when the open succeeded.
    if (RegOpenKeyEx (HKEY_CURRENT_USER, "software\\mecab", 0, KEY_READ, &hKey) == ERROR_SUCCESS) {
      char  v[1024];
      DWORD vt   = REG_NONE;
      DWORD size = sizeof (v) - 1;   // keep one byte free for the terminator below
      const LONG status = RegQueryValueEx (hKey, "mecabrc", 0, &vt, (BYTE *)v, &size);
      RegCloseKey (hKey);
      if (status == ERROR_SUCCESS && vt == REG_SZ) {
        v[size] = '\0';              // registry strings are not guaranteed to be terminated
        return std::string (v);
      }
    }
#endif

    return std::string (MECAB_DEFAULT_RC);
  }

  // Resolves the dictionary directory and returns the path of its "dicrc" file.
  //
  // The "dicdir" profile value is used ("." when it is empty).  Every
  // "$(rcpath)" placeholder in it is replaced by rcpath after
  // removeFileName () has been applied to rcpath (presumably leaving the
  // directory part -- confirm in its definition).  The resolved directory is
  // written back into the "dicdir" profile entry.
  static std::string getDicRc (Param &param, std::string rcpath)
  {
    std::string dir = param.getProfileString ("dicdir");
    if (dir.empty()) {
      dir = ".";   // fall back to the current directory
    }

    removeFileName (rcpath);
    replaceString (dir, "$(rcpath)", rcpath);
    param.setProfile ("dicdir", dir.c_str(), true);

    dir = createFileName (dir, "dicrc");
    return dir;
  }

// Member-initializer list shared by every Tagger constructor: each listed
// member starts out as 0.  _read and _write are not part of this list; they
// are assigned in Tagger::open (Param &).
#define TAGGER_INITILIZE  ibuf (0), ibufsize(0),  \
                          tokenizer(0), viterbi (0), mutex (0), \
                          node_format (0), bos_format (0), eos_format (0), ostrs(0)

// Statement sequence (not an expression) that formats the current Param error
// into _what.  It expects `param` and `_what` to be in scope and declares a
// local named `os`, so it can be expanded at most once per scope.
// std::ends is appended because ostrstream::str () returns the raw buffer
// without a terminator; assigning it to a std::string would otherwise read
// past the end of the formatted text.
#define TAGGER_ERROR  std::ostrstream os; \
                      os << "Tagger::open(): " << param.what () << "\n\n" \
                         <<  COPYRIGHT << "\ntry '--help' for more information.\n" \
                         << std::ends; \
                      _what = os.str(); os.freeze(false);

  // Creates an unopened tagger; every member named in TAGGER_INITILIZE is 0.
  Tagger::Tagger ()
    : TAGGER_INITILIZE
  {
  }

  // Creates a tagger and opens it from an argc/argv pair.
  // Throws std::runtime_error carrying _what when open (argc, argv) fails.
  Tagger::Tagger (int argc, char **argv)
    : TAGGER_INITILIZE
  {
    const bool opened = open (argc, argv);
    if (! opened) {
      throw std::runtime_error (_what);
    }
  }

  // Creates a tagger and opens it from a single argument string.
  // Throws std::runtime_error carrying _what when open (arg) fails.
  Tagger::Tagger (const char *arg)
    : TAGGER_INITILIZE
  {
    const bool opened = open (arg);
    if (! opened) {
      throw std::runtime_error (_what);
    }
  }

  // Releases everything the tagger owns by delegating to close ().
  Tagger::~Tagger()
  {
    close ();
  }
   
  // Returns the most recently recorded error message.  The pointer refers to
  // the internal _what string, so it is only valid until _what changes.
  const char *Tagger::what () 
  {
    const char *message = _what.c_str();
    return message;
  }

  // Opens the tagger from an argc/argv pair.
  // The arguments are parsed against long_options; on a parse failure the
  // Param error is stored in _what and false is returned.  Otherwise the
  // result of open (Param &) is returned.
  bool Tagger::open (int argc, char **argv)
  {
    Param param;

    if (param.open (argc, argv, long_options)) {
      return open (param);
    }

    TAGGER_ERROR;
    return false;
  }

  // Opens the tagger from a single argument string.
  // The string is parsed against long_options; on a parse failure the Param
  // error is stored in _what and false is returned.  Otherwise the result of
  // open (Param &) is returned.
  bool Tagger::open (const char *arg)
  {
    Param param;

    if (param.open (arg, long_options)) {
      return open (param);
    }

    TAGGER_ERROR;
    return false;
  }
   
  // Copies the text accumulated in `buf` into a std::string and unfreezes the
  // buffer again.  The copy is bounded by pcount (), so it is correct whether
  // or not the writer appended std::ends; trailing NUL characters are dropped.
  static std::string drainStream (std::ostrstream &buf)
  {
    const char *text = buf.str ();   // freezes the buffer until freeze (false)
    std::string result;
    if (text) result.assign (text, buf.pcount ());
    buf.freeze (false);
    while (! result.empty() && result[result.size() - 1] == '\0')
      result.erase (result.size() - 1);
    return result;
  }

  // (Re)initialises the tagger from an already parsed Param.
  //
  // Steps:
  //   1. close () releases whatever a previous open left behind.
  //   2. "help" / "version" requests are reported as an error whose message is
  //      the help / version text.
  //   3. The rc file (getDefaultRc) and then the dictionary's dicrc (getDicRc)
  //      are loaded into param.
  //   4. The tokenizer and the Viterbi engine are created, and ibufsize is
  //      clamped to [MIN_INPUT_BUFFER_SIZE, MAX_INPUT_BUFFER_SIZE].
  //   5. The reader (_read) and writer (_write) are selected from
  //      "input-format-type" and "output-format-type".
  //
  // Returns true on success.  On failure, returns false and leaves the reason
  // in _what; after an exception the partially built state is released with
  // close ().
  bool Tagger::open (Param &param)
  {
    try {

      close ();

      if (param.getProfileInt ("help")) {
        std::ostrstream helpText;
        param.help (helpText, long_options);
        throw std::runtime_error (drainStream (helpText));
      }

      if (param.getProfileInt ("version")) {
        std::ostrstream versionText;
        param.version (versionText, long_options);
        throw std::runtime_error (drainStream (versionText));
      }

      std::string rcfile = getDefaultRc (param);
      if (! param.load (rcfile.c_str())) {
        TAGGER_ERROR;
        return false;
      }

      std::string dicrcfile = getDicRc (param, rcfile);
      if (! param.load (dicrcfile.c_str())) {
        TAGGER_ERROR;
        return false;
      }

      tokenizer = new JapaneseTokenizer (param);
      viterbi   = new Viterbi           (param, tokenizer);
      ibufsize = _min (MAX_INPUT_BUFFER_SIZE,
                       _max (param.getProfileInt ("input-buffer-size"), MIN_INPUT_BUFFER_SIZE));

      // input format style: one sentence per line unless "lattice" is requested
      std::string istyle = param.getProfileString ("input-format-type");
      _read = &Tagger::readLine;
      if (istyle == "lattice") _read = &Tagger::readLattice;

      // output format style: the lattice writer is the default
      std::string ostyle = param.getProfileString ("output-format-type");
      _write = &Tagger::writeLattice;

      if (ostyle == "wakati") {
        _write = &Tagger::writeWakati;
      } else if (ostyle == "none") {
        _write = &Tagger::writeNone;
      } else if (ostyle == "normal") {
        _write = &Tagger::writeLattice;
      } else {
        // User-defined formats.  A named type "X" is looked up under
        // node-format-X / bos-format-X / eos-format-X; an empty type uses the
        // plain node-format / bos-format / eos-format keys.
        std::string nfk = "node-format";
        std::string bfk = "bos-format";
        std::string efk = "eos-format";

        if (ostyle != "") {
          nfk += "-"; nfk += ostyle;
          bfk += "-"; bfk += ostyle;
          efk += "-"; efk += ostyle;

          if (std::string (param.getProfileString (nfk.c_str())) == "")
            throw std::runtime_error (std::string("Param::open(): Unknown format type [") + ostyle + "]");
        }

        node_format = mystrdup (param.getProfileString (nfk.c_str()).c_str());
        bos_format  = mystrdup (param.getProfileString (bfk.c_str()).c_str());
        std::string ef = param.getProfileString (efk.c_str());
        if (ef == "") ef = "EOS\n";
        eos_format = mystrdup (ef.c_str());

        // Without a node format the default lattice writer stays selected.
        if (*node_format != '\0') _write = &Tagger::writeUser;
      }

      return true;
    }

    catch (const std::exception &e) {
      close ();
      _what = std::string ("Tagger::open(): ") + e.what ();
      return false;
    }
  }

  // Command-line driver: parses the options, opens the tagger and analyses
  // every input.
  //
  // "help" and "version" print to std::cout and return EXIT_SUCCESS.  Output
  // goes to the file named by the "output" option, or to std::cout when it is
  // empty.  Input comes from the remaining arguments (treated as file names),
  // or from std::cin when there are none.
  //
  // Returns EXIT_SUCCESS, or EXIT_FAILURE after writing "FATAL: <reason>" to
  // std::cerr.  Each input is consumed until parse (is, os) returns false,
  // which happens both at end of input and when a sentence cannot be analysed.
  int Tagger::parse (int argc, char **argv)
  {
    try {

      Param param;

      if (! param.open (argc, argv, long_options)) {
        TAGGER_ERROR;
        throw std::runtime_error (_what);
      }

      if (param.getProfileInt ("help")) {
        param.help (std::cout, long_options);
        return EXIT_SUCCESS;
      }

      if (param.getProfileInt ("version")) {
        param.version (std::cout, long_options);
        return EXIT_SUCCESS;
      }

      if (! open (param)) throw std::runtime_error (_what);

      // The output file lives on the stack so it is closed on every exit path,
      // including the exceptions thrown below.
      std::ofstream outFile;
      std::ostream *ofs = &std::cout;
      const std::string outputFileName = param.getProfileString ("output");

      if (! outputFileName.empty()) {
        outFile.open (outputFileName.c_str());
        if (! outFile) throw std::runtime_error (outputFileName + ", cannot open");
        ofs = &outFile;
      }

      const std::vector <std::string>& rest = param.getRestArg ();

      if (! rest.empty()) {
        for (std::vector <std::string>::size_type i = 0; i < rest.size(); ++i) {
          std::ifstream ifs (rest[i].c_str ());
          if (! ifs) throw std::runtime_error (rest[i] + ", cannot open");
          while (parse (ifs, *ofs)) {}
        }
      } else {
        while (parse (std::cin, *ofs)) {}
      }

      return EXIT_SUCCESS;
    }

    catch (const std::exception &e) {
      std::cerr << "FATAL: " << e.what () << std::endl;
      return EXIT_FAILURE;
    }
  }

  // Releases every resource owned by the tagger and resets the owning
  // pointers to 0, so close () can be called repeatedly (open () and the
  // destructor both call it).  Always returns true.
  bool Tagger::close ()
  {
    delete [] ibuf;        ibuf        = 0;
    delete tokenizer;      tokenizer   = 0;
    delete viterbi;        viterbi     = 0;
    delete mutex;          mutex       = 0;
    delete [] node_format; node_format = 0;
    delete [] eos_format;  eos_format  = 0;
    delete [] bos_format;  bos_format  = 0;

    if (ostrs) {
      ostrs->freeze (false);   // hand the buffer back so the destructor frees it
      delete ostrs;
      ostrs = 0;               // prevents a double delete on the next close ()
    }

    return true;
  }

  // Locks the tagger's mutex and returns what Mutex::lock () returns.
  // The Mutex is allocated on the first call.
  // NOTE(review): that first-call allocation is itself unguarded.
  bool Tagger::lock ()
  {
     if (mutex == 0) {
       mutex = new Mutex;
     }
     return mutex->lock ();
  }
   
  // Unlocks the tagger's mutex and returns what Mutex::unlock () returns.
  // Returns false when no mutex exists, i.e. lock () has not created one.
  bool Tagger::unlock ()
  {
     return mutex ? mutex->unlock () : false;
  }

  // Writes the analysis that starts at bosNode to os, using the writer that
  // open (Param &) stored in _write.  Returns os.
  std::ostream& Tagger::write (std::ostream &os, Node *bosNode)
  {
    std::ostream &result = (this->*_write) (os, bosNode);
    return result;
  }

  std::ostream& Tagger::writeLattice (std::ostream &os, Node *bosNode)
  {
    for (Node *node = bosNode->next; node->next; node = node->next) {
      os.write (node->surface, node->length);
      os << '\t' << node->feature << '\n';
    }
    os << "EOS\n"; 
    return os; 
  }
   
  std::ostream& Tagger::writeWakati (std::ostream &os, Node *bosNode)
  {
    for (Node *node = bosNode->next; node->next; node = node->next) {
      os.write (node->surface, node->length);
      os << ' ';
    }
    os << '\n';
    return os; 
  }

  // "none" writer: deliberately emits nothing and returns os untouched.
  std::ostream& Tagger::writeNone (std::ostream &os, Node * /* bosNode */)
  {
    return os;
  }

  std::ostream& Tagger::writeUser (std::ostream &os, Node *bosNode)
  {
    writeNode (bos_format, ibuf, bosNode, os);
    Node *node = 0;
    for (node = bosNode->next; node->next; node = node->next) 
      writeNode (node_format, ibuf, node, os);
    writeNode (eos_format, ibuf, node, os);
    return os;
  }

  // Reads the next unit of input into ibuf with the reader selected by
  // open (Param &), allocating ibuf (ibufsize bytes) on first use.
  //
  // Returns `is`; callers test its state.  When the buffer cannot be
  // allocated, the reason is stored in _what and failbit is set on `is`, so
  // callers stop instead of going on with a null ibuf.
  std::istream &Tagger::read (std::istream &is)
  {
    if (! ibuf) {
      try {
        ibuf = new char [ibufsize];
      }
      catch (const std::exception &e) {
        _what = std::string ("Tagger::parse (): ") + e.what ();
        is.setstate (std::ios::failbit);
        return is;
      }
    }

    return (this->*_read)(is);
  }

  // Line reader: stores the next input line (at most ibufsize - 1 characters)
  // in ibuf via istream::getline () and returns the stream.
  std::istream &Tagger::readLine (std::istream &is)
  {
    is.getline (ibuf, ibufsize);
    return is;
  }

  // Lattice reader: collects consecutive input lines into ibuf, joined and
  // followed by '\n', until an empty line is read, the stream fails, or the
  // buffer is full.  ibuf is always left null-terminated.
  //
  // Returns `is`.  When the buffer fills up exactly, the last line is kept
  // without its trailing '\n' and the stream is returned in its current state.
  std::istream &Tagger::readLattice (std::istream &is)
  {
    int   rem = (int) ibufsize;   // bytes still free starting at p
    char *p   = ibuf;

    while (true) {
      is.getline (p, rem);
      if (! is || ! p[0]) return is;

      const int used = (int) strlen (p) + 1;   // the line plus the '\n' to append

      // getline () left a terminator at p[used - 1].  Replacing it with '\n'
      // is only safe when one more byte is free for a new terminator.
      if (used >= rem) return is;

      p[used - 1] = '\n';
      p[used]     = '\0';
      p   += used;
      rem -= used;
    }
  }

  // Reads one unit of input from `is`, analyses it and writes the result to
  // `os`.  Returns false when nothing could be read, or when the analysis
  // fails (the reason is then stored in _what); returns true otherwise.
  bool Tagger::parse (std::istream &is, std::ostream &os)
  {
    if (! read (is)) {
      return false;
    }

    Node *bosNode = viterbi->analyze (ibuf, strlen(ibuf));
    if (bosNode) {
      write (os, bosNode);
      return true;
    }

    _what = std::string("Tagger::parse (): ") + viterbi->what ();
    return false;
  }
   
  // Reads one unit of input from `is`, analyses it and returns the formatted
  // result as a C string.
  //
  // Returns 0 when nothing could be read, or when the analysis fails (the
  // reason is then stored in _what).  The returned pointer refers to the
  // tagger's internal output stream: it stays valid only until the next
  // parse () call that returns a string, or until close ().
  const char* Tagger::parse (std::istream &is)
  {
    if (! read (is)) return 0;

    Node *bosNode = viterbi->analyze (ibuf, strlen(ibuf));
    if (! bosNode) {
      _what = std::string("Tagger::parse (): ") + viterbi->what ();
      return 0;
    }

    if (! ostrs) {
      ostrs = new std::ostrstream ();
    } else {
      // Reuse the stream: unfreeze the buffer returned last time and rewind.
      ostrs->freeze (false);
      ostrs->seekp (0, std::ios::beg);
    }

    write (*ostrs, bosNode);
    *ostrs << std::ends;     // str () does not terminate the buffer by itself
    return ostrs->str ();
  }
   
  // Analyses `str` and returns the formatted result as a C string.
  //
  // len is the number of bytes of str to analyse; 0 means "use strlen (str)".
  // Returns 0 when str is null or when the analysis fails (the reason is then
  // stored in _what).  The returned pointer refers to the tagger's internal
  // output stream: it stays valid only until the next parse () call that
  // returns a string, or until close ().
  const char *Tagger::parse (const char *str, unsigned int len) 
  {
    if (! str) {
      _what = "Tagger::parse (): NULL pointer is given";
      return 0;
    }

    Node *bosNode = viterbi->analyze (str, len ? len : strlen (str));
    if (! bosNode) {
      _what = std::string("Tagger::parse (): ") + viterbi->what ();
      return 0;
    }

    if (! ostrs) {
      ostrs = new std::ostrstream ();
    } else {
      // Reuse the stream: unfreeze the buffer returned last time and rewind.
      ostrs->freeze (false);
      ostrs->seekp (0, std::ios::beg);
    }

    write (*ostrs, bosNode);
    *ostrs << std::ends;     // str () does not terminate the buffer by itself
    return ostrs->str ();
  }

  // Analyses `str` and writes the formatted, null-terminated result into the
  // caller's buffer `out` of len2 bytes.
  //
  // len is the number of bytes of str to analyse; 0 means "use strlen (str)".
  // Returns out on success.  Returns 0 with the reason in _what when str or
  // out is null, when the analysis fails, or when the result (including its
  // terminator) does not fit into len2 bytes.
  const char *Tagger::parse (const char *str, unsigned int len, char *out, unsigned int len2)
  {
    if (! str || ! out) {
      _what = "Tagger::parse (): NULL pointer is given";
      return 0;
    }

    // A size of 0 would make ostrstream treat `out` as an existing C string
    // and take its length from strlen (out), so reject it up front.
    if (len2 == 0) {
      _what = "Tagger::parse (): output buffer overflow" ;
      return 0;
    }

    Node *bosNode =  viterbi->analyze (str, len ? len : strlen (str));
    if (! bosNode) {
      _what = std::string("Tagger::parse (): ") + viterbi->what ();
      return 0;
    }

    std::ostrstream os (out, len2);
    write (os, bosNode);
    os << std::ends;

    // A full fixed-size buffer makes the insertion fail, which sets
    // badbit/failbit rather than eofbit; test the whole stream state.
    if (! os || os.eof ()) {
      out[len2 - 1] = '\0';   // leave the partial output safely terminated
      _what = "Tagger::parse (): output buffer overflow" ;
      return 0;
    }

    return out;
  }

  // Analyses `str` and returns the first node of the result as produced by
  // Viterbi::analyze ().
  //
  // len is the number of bytes of str to analyse; 0 means "use strlen (str)".
  // Returns 0 when str is null or when the analysis fails; the reason is then
  // stored in _what.
  Node *Tagger::parseToNode (const char *str, unsigned int len) 
  {
    if (str == 0) {
      _what = "Tagger::parseToNode (): NULL pointer is given";
      return 0;
    }

    Node *bosNode = viterbi->analyze (str, len ? len : strlen (str));
    if (bosNode) {
      return bosNode;
    }

    _what = std::string("Tagger::parseToNode (): ") + viterbi->what ();
    return 0;
  }
}
