/** @file
 */
#if defined(HAVE_CONFIG_H)
#  include "../../config.h"
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <cstring>
#include <cerrno>
#include <utility>
#include <algorithm>

#include <boost/bind.hpp>

#include <glib/gmessages.h>
#include <cabin.h>
#include "html_filter.hpp"


namespace gdestraier {
  namespace builder {
    namespace filter {

      /** Default constructor; the filter has nothing to initialise here. */
      from_html::from_html() { }

      /** Destructor; nothing is released here. */
      from_html::~from_html() { }


      /**
       * Extension table handed to the factory by get_factory().
       * Each entry pairs a file-name extension with a MIME type string;
       * the table is terminated by an all-zero entry.
       *
       * NOTE(review): the registered MIME type for XHTML is
       * "application/xhtml+xml"; confirm "text/xhtml+xml" is what the
       * consumers of this table expect.
       */
      factory::extention_map_type from_html::extentions_[] = {
        { "htm",   "text/html"      },
        { "html",  "text/html"      },
        { "xht",   "text/xhtml+xml" },
        { "xhtml", "text/xhtml+xml" },
        { 0,       0                }
      };

      /**
       * Returns the factory object describing this filter.
       *
       * The factory is a function-local static built on first use with the
       * name "HTML", the create() function and the extension table.
       *
       * @return reference to the shared factory instance.
       */
      factory const& from_html::get_factory()
      {
        static factory html_factory("HTML",
                                    &from_html::create,
                                    from_html::extentions_);
        return html_factory;
      }

      /**
       * Creation function registered with the factory.
       *
       * @return a newly allocated from_html filter (allocated with new).
       */
      abstract_filter* from_html::create()
      {
        return new from_html();
      }





      /**
       * Builds a document from an HTML file.
       *
       * Creates the document, records its MIME type (falling back to
       * "text/html" when none is given), loads the file, records the detected
       * encoding, converts the text to UTF-8 when the encoding is not
       * UTF-8 compatible, and finally hands the text to parse_html().
       *
       * @param doc       document object to create and fill.
       * @param index     index used to detect the text encoding.
       * @param uri       location passed to load_file_content().
       * @param text_uri  not used by this function.
       * @param info      file information; only the size is read.
       * @param mime_type MIME type to record, or null for "text/html".
       * @return false only when the file content could not be loaded;
       *         true otherwise (including empty files and failed conversions).
       */
      bool from_html::operator()(hyperestraier::local_document* doc,
                                 gdestraier::model::index_type const& index,
                                 ::GnomeVFSURI* uri,
                                 char const* text_uri,
                                 ::GnomeVFSFileInfo* info,
                                 char const* mime_type) const
      {
        doc->create();

        char const* type_attr = "text/html";
        if (mime_type) type_attr = mime_type;
        doc->set_attr(ESTDATTRTYPE, type_attr);

        // An empty file yields a document with no text.
        if (info->size == 0)
          return true;

        ::GnomeVFSFileSize length;
        void* buffer = load_file_content(uri, info->size, &length);
        if (buffer == 0)
          return false;

        // Detect the character encoding and record it on the document.
        char const* raw_text = static_cast<char const*>(buffer);
        gdestraier::model::encoding const* detected =
          index.fathom_encoding(raw_text, raw_text + length);
        doc->set_attr("encoding", detected->id_);

        if (! detected->is_utf8_compatible_) {
          // Not a UTF-8 compatible encoding, so convert the text to UTF-8.
          int converted_len = 0;
          char* converted = ::est_iconv(raw_text, length, detected->id_, "UTF-8", &converted_len, 0);

          std::free(buffer);
          if (converted == 0)
            return true; // Conversion failed; register the document without content.

          buffer = converted;
          length = converted_len;
        }

        char const* text = static_cast<char const*>(buffer);
        parse_html(text, text + std::size_t(length), index, doc);
        std::free(buffer);
        return true;
      }



      /**
       * Parses HTML text and fills in the document object.
       *
       * The text in [first, last) is split into tag nodes and text nodes.
       * Text inside <title> becomes the title attribute, <meta name="author">
       * supplies the author attribute, the lang / xml:lang attribute of <html>
       * supplies the language attribute, and text inside <style> is skipped.
       * All other text is collected and added to the document as one paragraph
       * each time a block-level tag (h1-h6, p, hr, ul, ol, dl, li, dt, dd, th,
       * td, pre) is met, and once more at the end of input.
       *
       * @param first  start of the text (expected to be UTF-8 compatible).
       * @param last   one past the end of the text; it is never dereferenced.
       * @param index  not used by this function.
       * @param doc    document receiving attributes and text.
       * @return false when the input is empty, true otherwise.
       */
      bool from_html::parse_html(char const* first,
                                 char const* last,
                                 gdestraier::model::index_type const& index,
                                 hyperestraier::local_document* doc) const
      {
        if (first == last) return false;

        struct state_type {
          enum { OUTSIDE, TITLE, META, HTML, IGNORE };
          int state;

          typedef std::pair<char const*, char const*> const_substr_type;

          const_substr_type meta_name_;
          const_substr_type meta_content_;
          const_substr_type html_lang_;

          // Decoded text fragments (malloc'ed) waiting for the next paragraph flush.
          std::list<char*> paragraph_;

          state_type() : state(OUTSIDE) { }


          /** True for bytes 0x00-0x20. Compared as unsigned so that UTF-8 bytes are not blanks. */
          static bool is_blank(char c) {
            return static_cast<unsigned char>(c) <= 0x20;
          }


          /**
           * Case-insensitive prefix test: true when [first, last) starts with `right`.
           * Characters are converted to unsigned char before std::toupper, because
           * passing a negative value to it is undefined.
           */
          static bool tagcmp(char const* first, char const* last,
                             char const* right) {
            while (*right != '\0' && first < last &&
                   std::toupper(static_cast<unsigned char>(*first)) ==
                   std::toupper(static_cast<unsigned char>(*right))) {
              first++; right++;
            }
            return *right == '\0';
          }


          /**
           * True when the tag name starting at `first` is exactly `name`
           * (case-insensitive), i.e. the name is followed by the end of the
           * node, a blank, '/' or '>'.
           */
          static bool tag_is(char const* first, char const* last, char const* name) {
            if (! tagcmp(first, last, name)) return false;
            char const* name_end = first + std::strlen(name);
            return name_end == last || is_blank(*name_end) || *name_end == '/' || *name_end == '>';
          }


          /** True when the tag name starting at `first` is one that ends the current paragraph. */
          static bool is_paragraph_break(char const* first, char const* last) {
            static char const* const para_dlm[] = { "h1", "h2", "h3", "h4", "h5", "h6",
                                                    "p", "hr",
                                                    "ul", "ol", "dl", "li", "dt", "dd",
                                                    "th", "td", "pre" };
            std::size_t const count = sizeof(para_dlm) / sizeof(para_dlm[0]);
            for (std::size_t i = 0; i < count; ++i)
              if (tag_is(first, last, para_dlm[i])) return true;
            return false;
          }


          /**
           * Looks up a named character reference.
           *
           * @param name        first character after the '&'.
           * @param avail       number of readable characters starting at `name`.
           * @param matched_len receives the length of the matched entity name.
           * @return the UTF-8 replacement text, or 0 when nothing matches.
           *
           * An exact (case-sensitive) match is preferred, because many names
           * differ only by case (Agrave/agrave, Alpha/alpha, Dagger/dagger).
           * A case-insensitive match is used only as a fallback, so that
           * sloppy input such as "&AMP;" is still decoded.
           */
          static char const* lookup_entity(char const* name, std::size_t avail,
                                           std::size_t* matched_len) {
            struct entity_type {
              char const* name_;
              char const* decode_;
            };
            // Every replacement is shorter than "&name;", which decode() relies on.
            static entity_type const entities[] = {
              // Basic symbols
              { "amp",  "&" }, { "lt", "<" }, { "gt", ">" }, { "quot", "\"" }, { "apos", "'" },
              // ISO-8859-1
              { "nbsp", " " },         { "iexcl", "\xc2\xa1" },  { "cent", "\xc2\xa2" },
              { "pound", "\xc2\xa3" }, { "curren", "\xc2\xa4" }, { "yen", "\xc2\xa5" },
              { "brvbar", "\xc2\xa6"}, { "sect", "\xc2\xa7" },   { "uml", "\xc2\xa8" },
              { "copy", "\xc2\xa9"  }, { "ordf", "\xc2\xaa" },   { "laquo", "\xc2\xab" },
              { "not", "\xc2\xac" },   { "shy", "\xc2\xad" },    { "reg", "\xc2\xae" },
              { "macr", "\xc2\xaf" },  { "deg", "\xc2\xb0" },    { "plusmn", "\xc2\xb1" },
              { "sup2", "\xc2\xb2"},   { "sup3", "\xc2\xb3"},    { "acute", "\xc2\xb4" },
              { "micro", "\xc2\xb5"},  { "para", "\xc2\xb6"},    { "middot", "\xc2\xb7" },
              { "cedil", "\xc2\xb8"},  { "sup1", "\xc2\xb9"},    { "ordm", "\xc2\xba" },
              { "raquo", "\xc2\xbb"},  { "frac14", "\xc2\xbc" }, { "frac12", "\xc2\xbd" },
              { "frac34", "\xc2\xbe"}, { "iquest", "\xc2\xbf" }, { "Agrave", "\xc3\x80" },
              { "Aacute", "\xc3\x81"}, { "Acirc", "\xc3\x82" },  { "Atilde", "\xc3\x83" },
              { "Auml", "\xc3\x84"},   { "Aring", "\xc3\x85" },  { "AElig", "\xc3\x86" },
              { "Ccedil", "\xc3\x87"}, { "Egrave", "\xc3\x88" }, { "Eacute", "\xc3\x89" },
              { "Ecirc", "\xc3\x8a"},  { "Euml", "\xc3\x8b" },   { "Igrave", "\xc3\x8c" },
              { "Iacute", "\xc3\x8d"}, { "Icirc", "\xc3\x8e" },  { "Iuml", "\xc3\x8f" },
              { "ETH", "\xc3\x90"},    { "Ntilde", "\xc3\x91" }, { "Ograve", "\xc3\x92" },
              { "Oacute", "\xc3\x93"}, { "Ocirc", "\xc3\x94" },  { "Otilde", "\xc3\x95" },
              { "Ouml", "\xc3\x96"},   { "times", "\xc3\x97" },  { "Oslash", "\xc3\x98" },
              { "Ugrave", "\xc3\x99"}, { "Uacute", "\xc3\x9a" }, { "Ucirc", "\xc3\x9b" },
              { "Uuml", "\xc3\x9c"},   { "Yacute", "\xc3\x9d" }, { "THORN", "\xc3\x9e" },
              { "szlig", "\xc3\x9f"},  { "agrave", "\xc3\xa0" }, { "aacute", "\xc3\xa1" },
              { "acirc", "\xc3\xa2"},  { "atilde", "\xc3\xa3" }, { "auml", "\xc3\xa4" },
              { "aring", "\xc3\xa5"},  { "aelig", "\xc3\xa6" },  { "ccedil", "\xc3\xa7" },
              { "egrave", "\xc3\xa8"}, { "eacute", "\xc3\xa9" }, { "ecirc", "\xc3\xaa" },
              { "euml", "\xc3\xab"},   { "igrave", "\xc3\xac" }, { "iacute", "\xc3\xad" },
              { "icirc", "\xc3\xae"},  { "iuml", "\xc3\xaf" },   { "eth", "\xc3\xb0" },
              { "ntilde", "\xc3\xb1"}, { "ograve", "\xc3\xb2" }, { "oacute", "\xc3\xb3" },
              { "ocirc", "\xc3\xb4"},  { "otilde", "\xc3\xb5" }, { "ouml", "\xc3\xb6" },
              { "divide", "\xc3\xb7"}, { "oslash", "\xc3\xb8" }, { "ugrave", "\xc3\xb9" },
              { "uacute", "\xc3\xba"}, { "ucirc", "\xc3\xbb" },  { "uuml", "\xc3\xbc" },
              { "yacute", "\xc3\xbd"}, { "thorn", "\xc3\xbe" },  { "yuml", "\xc3\xbf" },
              // ISO-10646
              { "fnof", "\xc6\x92"},        { "Alpha", "\xce\x91" },      { "Beta", "\xce\x92" },
              { "Gamma", "\xce\x93"},       { "Delta", "\xce\x94" },      { "Epsilon", "\xce\x95" },
              { "Zeta", "\xce\x96"},        { "Eta", "\xce\x97" },        { "Theta", "\xce\x98" },
              { "Iota", "\xce\x99"},        { "Kappa", "\xce\x9a" },      { "Lambda", "\xce\x9b" },
              { "Mu", "\xce\x9c"},          { "Nu", "\xce\x9d" },         { "Xi", "\xce\x9e" },
              { "Omicron", "\xce\x9f"},     { "Pi", "\xce\xa0" },         { "Rho", "\xce\xa1" },
              { "Sigma", "\xce\xa3"},       { "Tau", "\xce\xa4" },        { "Upsilon", "\xce\xa5" },
              { "Phi", "\xce\xa6"},         { "Chi", "\xce\xa7" },        { "Psi", "\xce\xa8" },
              { "Omega", "\xce\xa9"},       { "alpha", "\xce\xb1" },      { "beta", "\xce\xb2" },
              { "gamma", "\xce\xb3"},       { "delta", "\xce\xb4" },      { "epsilon", "\xce\xb5" },
              { "zeta", "\xce\xb6"},        { "eta", "\xce\xb7" },        { "theta", "\xce\xb8" },
              { "iota", "\xce\xb9"},        { "kappa", "\xce\xba" },      { "lambda", "\xce\xbb" },
              { "mu", "\xce\xbc"},          { "nu", "\xce\xbd" },         { "xi", "\xce\xbe" },
              { "omicron", "\xce\xbf"},     { "pi", "\xcf\x80" },         { "rho", "\xcf\x81" },
              { "sigmaf", "\xcf\x82"},      { "sigma", "\xcf\x83" },      { "tau", "\xcf\x84" },
              { "upsilon", "\xcf\x85"},     { "phi", "\xcf\x86" },        { "chi", "\xcf\x87" },
              { "psi", "\xcf\x88"},         { "omega", "\xcf\x89" },      { "thetasym", "\xcf\x91" },
              { "upsih", "\xcf\x92"},       { "piv", "\xcf\x96" },        { "bull", "\xe2\x80\xa2" },
              { "hellip", "\xe2\x80\xa6"},  { "prime", "\xe2\x80\xb2" },  { "Prime", "\xe2\x80\xb3" },
              { "oline", "\xe2\x80\xbe"},   { "frasl", "\xe2\x81\x84" },  { "weierp", "\xe2\x84\x98" },
              { "image", "\xe2\x84\x91"},   { "real", "\xe2\x84\x9c" },   { "trade", "\xe2\x84\xa2" },
              { "alefsym", "\xe2\x84\xb5"}, { "larr", "\xe2\x86\x90" },   { "uarr", "\xe2\x86\x91" },
              { "rarr", "\xe2\x86\x92"},    { "darr", "\xe2\x86\x93" },   { "harr", "\xe2\x86\x94" },
              { "crarr", "\xe2\x86\xb5"},   { "lArr", "\xe2\x87\x90" },   { "uArr", "\xe2\x87\x91" },
              { "rArr", "\xe2\x87\x92"},    { "dArr", "\xe2\x87\x93" },   { "hArr", "\xe2\x87\x94" },
              { "forall", "\xe2\x88\x80"},  { "part", "\xe2\x88\x82" },   { "exist", "\xe2\x88\x83" },
              { "empty", "\xe2\x88\x85"},   { "nabla", "\xe2\x88\x87" },  { "isin", "\xe2\x88\x88" },
              { "notin", "\xe2\x88\x89"},   { "ni", "\xe2\x88\x8b" },     { "prod", "\xe2\x88\x8f" },
              { "sum", "\xe2\x88\x91"},     { "minus", "\xe2\x88\x92" },  { "lowast", "\xe2\x88\x97" },
              { "radic", "\xe2\x88\x9a"},   { "prop", "\xe2\x88\x9d" },   { "infin", "\xe2\x88\x9e" },
              { "ang", "\xe2\x88\xa0"},     { "and", "\xe2\x88\xa7" },    { "or", "\xe2\x88\xa8" },
              { "cap", "\xe2\x88\xa9"},     { "cup", "\xe2\x88\xaa" },    { "int", "\xe2\x88\xab" },
              { "there4", "\xe2\x88\xb4"},  { "sim", "\xe2\x88\xbc" },    { "cong", "\xe2\x89\x85" },
              { "asymp", "\xe2\x89\x88"},   { "ne", "\xe2\x89\xa0" },     { "equiv", "\xe2\x89\xa1" },
              { "le", "\xe2\x89\xa4"},      { "ge", "\xe2\x89\xa5" },     { "sub", "\xe2\x8a\x82" },
              { "sup", "\xe2\x8a\x83"},     { "nsub", "\xe2\x8a\x84" },   { "sube", "\xe2\x8a\x86" },
              { "supe", "\xe2\x8a\x87"},    { "oplus", "\xe2\x8a\x95" },  { "otimes", "\xe2\x8a\x97" },
              { "perp", "\xe2\x8a\xa5"},    { "sdot", "\xe2\x8b\x85" },   { "lceil", "\xe2\x8c\x88" },
              { "rceil", "\xe2\x8c\x89"},   { "lfloor", "\xe2\x8c\x8a" }, { "rfloor", "\xe2\x8c\x8b" },
              { "lang", "\xe2\x8c\xa9"},    { "rang", "\xe2\x8c\xaa" },   { "loz", "\xe2\x97\x8a" },
              { "spades", "\xe2\x99\xa0"},  { "clubs", "\xe2\x99\xa3" },  { "hearts", "\xe2\x99\xa5" },
              { "diams", "\xe2\x99\xa6"},   { "OElig", "\xc5\x92" },      { "oelig", "\xc5\x93" },
              { "Scaron", "\xc5\xa0"},      { "scaron", "\xc5\xa1" },     { "Yuml", "\xc5\xb8" },
              { "circ", "\xcb\x86"},        { "tilde", "\xcb\x9c" },      { "ensp", "\xe2\x80\x82" },
              { "emsp", "\xe2\x80\x83"},    { "thinsp", "\xe2\x80\x89" }, { "zwnj", "\xe2\x80\x8c" },
              { "zwj", "\xe2\x80\x8d"},     { "lrm", "\xe2\x80\x8e" },    { "rlm", "\xe2\x80\x8f" },
              { "ndash", "\xe2\x80\x93"},   { "mdash", "\xe2\x80\x94" },  { "lsquo", "\xe2\x80\x98" },
              { "rsquo", "\xe2\x80\x99"},   { "sbquo", "\xe2\x80\x9a" },  { "ldquo", "\xe2\x80\x9c" },
              { "rdquo", "\xe2\x80\x9d"},   { "bdquo", "\xe2\x80\x9e" },  { "dagger", "\xe2\x80\xa0" },
              { "Dagger", "\xe2\x80\xa1" }, { "permil", "\xe2\x80\xb0" }, { "lsaquo", "\xe2\x80\xb9" },
              { "rsaquo", "\xe2\x80\xba" }, { "euro", "\xe2\x82\xac"},
              { 0, 0 }
            };

            // Pass 0 compares exactly, pass 1 ignores case.
            for (int pass = 0; pass < 2; ++pass) {
              for (entity_type const* pe = entities; pe->name_ != 0; ++pe) {
                std::size_t const elen = std::strlen(pe->name_);
                // The name must be followed by ';' inside the readable range.
                if (avail <= elen || name[elen] != ';') continue;
                int const diff = (pass == 0) ? std::strncmp(name, pe->name_, elen)
                                             : ::strncasecmp(name, pe->name_, elen);
                if (diff == 0) {
                  *matched_len = elen;
                  return pe->decode_;
                }
              }
            }
            return 0;
          }


          /**
           * Returns a malloc'ed, NUL-terminated copy of [first, last) with named
           * character references replaced by UTF-8 text. The caller frees it
           * with std::free().
           */
          char* decode(char const* first, char const* last) {
            // The output never grows: every replacement is shorter than its reference.
            char* str = static_cast<char*>(std::malloc(last - first + 1));
            char* pstr = str;
            char const* p = first;

            while (p < last) {
              if (*p == '&') {
                std::size_t name_len = 0;
                char const* replacement = lookup_entity(p + 1, last - (p + 1), &name_len);
                if (replacement != 0) {
                  while (*replacement) *pstr++ = *replacement++;
                  p += name_len + 2;   // skip '&', the name and ';'
                  continue;
                }
                // No entity matched: fall through and copy '&' as an ordinary character.
              }
              *pstr++ = *p++;
            }

            *pstr = '\0';
            return str;
          }


          /** Records the attributes of interest for the tag currently being parsed. */
          void on_attribute(const_substr_type k, const_substr_type v) {
            switch (state) {
            case META:
              if (tagcmp(k.first, k.second, "name") || tagcmp(k.first, k.second, "http-equiv"))
                meta_name_ = v;
              else if (tagcmp(k.first, k.second, "content"))
                meta_content_ = v;
              break;
            case HTML:
              if (tagcmp(k.first, k.second, "lang") || tagcmp(k.first, k.second, "xml:lang") )
                html_lang_ = v;
              break;
            }
          }


          /**
           * Splits the tag node [first, last) into attribute key/value pairs and
           * reports each pair with a non-empty key to on_attribute().
           * `last` is never dereferenced, so it may be the end of the buffer.
           */
          void parse_attribute(char const* first, char const* last) {
            const_substr_type k, v;
            char const* p = first;

            // Skip the tag opener and then the element name.
            while (p != last && (*p == '<' || *p == '/' || *p == '?' || *p == '!' || *p == ' '))  p++;
            while (p != last && ! is_blank(*p) && *p != '/' && *p != '>') p++;

            while (p != last) {
              while (p != last && (is_blank(*p) || *p == '/' || *p == '?' || *p == '>')) p++;

              k.first = p;
              while (p != last && ! is_blank(*p) && *p != '/' && *p != '>' && *p != '=') p++;
              k.second = p;

              while (p != last && (*p == '=' || is_blank(*p))) p++;
              if (p != last && (*p == '"' || *p == '\'')) {
                // Quoted value: runs up to the matching quote.
                char const quote = *p;
                p++;
                v.first = p;
                while (p != last && *p != quote) p++;
                v.second = p;
              } else {
                // Unquoted value: runs up to a blank, a quote, '/' or '>'.
                v.first = p;
                while (p != last && ! is_blank(*p) && *p != '"' && *p != '\'' && *p != '/' && *p != '>')
                  p++;
                v.second = p;
              }
              if (p != last) p++;   // step over the character that ended the value
              if (k.first != k.second)  on_attribute(k, v);
            }
          }


          /**
           * Joins the pending text fragments, adds the result to the document as
           * one text block and releases the fragments.
           */
          void flush_paragraph(hyperestraier::local_document* doc)
          {
            if (paragraph_.empty()) return ;

            std::size_t len = 0;
            std::list<char*>::iterator i;

            for (i = paragraph_.begin(); i != paragraph_.end(); ++i)
              len += std::strlen(*i);

            char* para = static_cast<char*>(std::malloc(len + 1));
            char* p = para;
            for (i = paragraph_.begin(); i != paragraph_.end(); ++i) {
              std::size_t const n = std::strlen(*i);
              std::memcpy(p, *i, n);
              p += n;
              std::free(*i);
            }
            *p = '\0';

            doc->add_text(para);
            // NOTE(review): assumes add_text() copies the text (as est_doc_add_text does),
            // matching the set_attr()-then-free pattern used in on_node().
            std::free(para);

            paragraph_.clear();
          }


          /**
           * Handles one node of the input.
           *
           * @param first start of the node; a tag node starts with '<' and
           *              excludes its closing '>'.
           * @param last  one past the end of the node.
           * @param cdata true when the node is the body of a CDATA section,
           *              which is taken literally.
           * @param doc   document receiving attributes and text.
           */
          void on_node(char const* first, char const* last, bool cdata,
                       hyperestraier::local_document* doc) {

            if (cdata || *first != '<') {
              // Text node.
              if (state == IGNORE) return ;

              char* str;
              if (cdata) str = ::strndup(first, last - first);
              else       str = decode(first, last);

              switch (state) {
              case TITLE:
                doc->set_attr(ESTDATTRTITLE, str);
                std::free(str);
                break;

              default:
                paragraph_.push_back(str);   // freed by flush_paragraph()
                break;
              }
              return ;
            }

            // Tag node.
            char const* name = first + 1;

            if (tagcmp(name, last, "title"))
              state = TITLE;
            else if (tagcmp(name, last, "meta")) {
              state = META;
              meta_name_.first = 0;
              meta_content_.first = 0;
              parse_attribute(first, last);
              if (meta_name_.first != 0 && meta_content_.first != 0 &&
                  meta_name_.second - meta_name_.first >= 6 &&
                  ::strncasecmp(meta_name_.first, "author", 6) == 0) {
                char* str = decode(meta_content_.first, meta_content_.second);
                doc->set_attr(ESTDATTRAUTHOR, str);
                std::free(str);
              }
            } else if (tagcmp(name, last, "html")) {
              state = HTML;
              html_lang_.first = 0;
              parse_attribute(first, last);
              if (html_lang_.first != 0) {
                char* str = decode(html_lang_.first, html_lang_.second);
                doc->set_attr(ESTDATTRLANG, str);
                std::free(str);
              }
            } else if (tagcmp(name, last, "style")) {
              state = IGNORE;
            } else if (tagcmp(name, last, "/style")) {
              state = OUTSIDE;
            } else {
              if (is_paragraph_break(name, last))
                flush_paragraph(doc);
              state = OUTSIDE;
            }
          }
        };

        state_type parser;
        char const* p = first;
        char const* text_first = first;   // start of the node currently being collected
        bool tag = false;                 // true while inside '<' ... '>'

        while (p != last) {
          if (! tag && *p == '<') {
            // Emit the text collected before this '<'.
            if (p > text_first) parser.on_node(text_first, p, false, doc);

            std::size_t const remain = last - p;

            if (remain >= 4 && std::memcmp(p + 1, "!--", 3) == 0) {
              // Comment: runs to "-->", or to the end of input when unterminated.
              char const* comment_end = last;
              for (char const* ep = p + 2; ep + 3 <= last; ep++) {
                if (std::memcmp(ep, "-->", 3) == 0) {
                  comment_end = ep + 3;
                  break;
                }
              }
              parser.on_node(p, comment_end, false, doc);
              p = text_first = comment_end;
              continue;
            }

            if (remain >= 9 && ::strncasecmp(p + 1, "![CDATA[", 8) == 0) {
              // CDATA section: the body runs to "]]>", or to the end of input when unterminated.
              char const* body = p + 9;
              char const* body_end = last;
              char const* resume = last;
              for (char const* ep = body; ep + 3 <= last; ep++) {
                if (ep[0] == ']' && ep[1] == ']' && ep[2] == '>') {
                  body_end = ep;
                  resume = ep + 3;
                  break;
                }
              }
              parser.on_node(body, body_end, true, doc);
              p = text_first = resume;
              continue;
            }

            // Ordinary tag: collect it up to the closing '>'.
            tag = true;
            text_first = p;
          } else if (tag && *p == '>') {
            parser.on_node(text_first, p, false, doc);
            tag = false;
            text_first = p + 1;
          }
          p++;
        }

        // Emit whatever is left (trailing text or an unterminated tag).
        if (p > text_first) parser.on_node(text_first, p, false, doc);

        parser.flush_paragraph(doc);

        return true;
      }


    }
  }
}
