/*******************************************************************************
 * Copyright (c) 2008 IGA Tosiki, NTT DATA BUSINESS BRAINS Corp.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    IGA Tosiki (NTT DATA BUSINESS BRAINS Corp.) - initial API and implementation
 *******************************************************************************/
/*
 * blanco Framework
 * Copyright (C) 2008 NTT DATA BUSINESS BRAINS CORPORATION
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */
package blanco.html.normalizer.util;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;

import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Parser;
import org.ccil.cowan.tagsoup.XMLWriter;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import blanco.html.normalizer.parser.helper.BlancoHtmlDecodeNumericCharacterReferenceContentHandler;
import blanco.html.parser.BlancoHtmlContentSerializer;
import blanco.html.parser.BlancoHtmlParser;
import blanco.html.parser.BlancoHtmlParserFactory;

/**
 * Utility class for normalizing HTML.
 * 
 * @author IGA Tosiki
 */
public class BlancoHtmlNormalizerUtil {
    /**
     * Normalizes the given HTML.
     * 
     * The input is first parsed once so that the parser can report an
     * encoding, then it is run through TagSoup, and finally numeric character
     * references are decoded.
     * 
     * @param argHtmlInput
     *            The HTML to normalize, as raw bytes.
     * @return The normalized HTML, as raw bytes.
     * @throws IOException
     *             If an I/O error occurs, or if a SAXException is raised
     *             while parsing (the SAXException is attached as the cause).
     */
    public static byte[] normalize(final byte[] argHtmlInput)
            throws IOException {
        try {
            // First pass: parse only to obtain the encoding reported by the
            // parser.
            final BlancoHtmlParser encodingParser = BlancoHtmlParserFactory
                    .getInstance();
            encodingParser.parse(argHtmlInput);

            // NOTE(review): getEncoding() is intentionally re-read at each use
            // rather than cached, because it is not visible here whether
            // BlancoHtmlParserFactory.getInstance() returns a shared instance
            // whose state changes on the later parse - confirm before
            // hoisting.
            String current = new String(argHtmlInput, encodingParser
                    .getEncoding());
            current = normalizeByTagSoup(current);
            current = normalizeNumericCharacterReference(current
                    .getBytes(encodingParser.getEncoding()), encodingParser
                    .getEncoding());

            return current.getBytes(encodingParser.getEncoding());
        } catch (SAXException e) {
            // Keep the original exception as the cause so that its stack
            // trace is not lost. initCause is used (instead of the
            // two-argument constructor) to stay compatible with older JDKs.
            final IOException ioException = new IOException(
                    "SAXException occurred: " + e.toString());
            ioException.initCause(e);
            throw ioException;
        }
    }

    /**
     * Normalizes HTML using TagSoup.
     * 
     * Useful for supplementing optional tags that were omitted.
     * 
     * @param argHtmlInput
     *            The HTML before normalization.
     * @return The normalized output HTML.
     * @throws IOException
     *             If an I/O error occurs.
     * @throws SAXException
     *             If a SAX error occurs.
     */
    private static String normalizeByTagSoup(final String argHtmlInput)
            throws IOException, SAXException {
        final XMLReader parser = new Parser();

        final HTMLSchema schema = new HTMLSchema();
        parser.setProperty(Parser.schemaProperty, schema);

        final StringWriter output = new StringWriter();

        final XMLWriter serializer = new XMLWriter(output);
        parser.setContentHandler(serializer);

        // TODO At the time this was checked, the DTD handler below did not
        // work as expected.
        // parser.setDTDHandler(serializer);
        // As a workaround, force the DOCTYPE to be emitted with the following
        // setting. (for html)
        serializer.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC,
                "-//W3C//DTD HTML 4.01 Transitional//EN");

        // Prevent a namespace attribute from being added to <html>.
        parser.setFeature(Parser.namespacesFeature, false);

        final InputSource input = new InputSource();
        input.setCharacterStream(new StringReader(argHtmlInput));

        // Set the output method to html (rather than xhtml).
        serializer.setOutputProperty(XMLWriter.METHOD, "html");

        // Suppress output of the XML declaration.
        serializer.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");

        // Suppress adding default values to attributes.
        parser.setFeature(Parser.defaultAttributesFeature, false);

        // Specify the character encoding of the output destination.
        // NOTE(review): this is hard-coded and does not follow the encoding
        // reported for the input - confirm this is intended.
        serializer.setOutputProperty(XMLWriter.ENCODING, "Windows-31J");

        // Handle elements with unknown names leniently. (presumed to be
        // necessary for dealing with jsp)
        parser.setFeature(
                "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons",
                false);

        // Output comments as they are.
        parser.setProperty(Parser.lexicalHandlerProperty, serializer);

        // Run the parse.
        parser.parse(input);

        return output.toString();
    }

    /**
     * Decodes numeric character references.
     * 
     * @param argHtmlInput
     *            The input HTML, as raw bytes.
     * @param encoding
     *            The encoding passed to the decoding handler.
     * @return The output HTML.
     * @throws IOException
     *             If an I/O error occurs.
     */
    private static String normalizeNumericCharacterReference(
            final byte[] argHtmlInput, final String encoding)
            throws IOException {
        // Handler chain: parser -> numeric character reference decoder ->
        // serializer -> strWriter.
        final BlancoHtmlDecodeNumericCharacterReferenceContentHandler handler = new BlancoHtmlDecodeNumericCharacterReferenceContentHandler();
        final StringWriter strWriter = new StringWriter();
        BlancoHtmlContentSerializer serializer = new BlancoHtmlContentSerializer();
        // NOTE(review): the BufferedWriter is never flushed explicitly in
        // this method; presumably the serializer flushes or closes it when
        // the document ends - confirm, otherwise output may be truncated.
        serializer.setWriter(new BufferedWriter(strWriter));
        handler.setHandler(serializer);
        handler.setEncoding(encoding);

        final BlancoHtmlParser parser = BlancoHtmlParserFactory.getInstance();
        parser.setHandler(handler);

        parser.parse(argHtmlInput);

        return strWriter.toString();
    }
}
