# -*- coding: utf-8 -*-
from wmcommon import *
from wmhttp import *
import html5lib
from html5lib import treebuilders,HTMLParser

def getAttributeNode(node,name):
    """
    Look up an attribute node on a DOM element, ignoring case in the name.

    node : element node to inspect
    name : attribute name to look for (compared case-insensitively)

    Returns the first attribute node whose name matches, or None when the
    node has no attributes or none of them matches.
    """
    if not node.hasAttributes():
        return None
    attrs = node.attributes
    for idx in range(attrs.length):
        candidate = attrs.item(idx)
        if candidate.name.upper() == name.upper():
            return candidate
    return None

def getNodeText(node):
    """
    Concatenate the data of the node's direct TEXT_NODE children.

    node : node whose immediate children are inspected

    Only direct children are considered; text nested inside child elements
    and non-text children (comments, CDATA sections, ...) are skipped.
    Returns '' when there is no text child.
    """
    pieces = [kid.data for kid in node.childNodes
              if kid.nodeType == kid.TEXT_NODE]
    return ''.join(pieces)

def prependChild(parent,child):
    """
    Insert child as the first child of parent.

    parent : node that receives the child
    child  : node to insert in front of the existing children

    Uses the DOM insertBefore() operation so the order of the existing
    children is preserved and the sibling links stay consistent.  When
    parent has no children yet, firstChild is None and insertBefore()
    appends the node.
    """
    parent.insertBefore(child,parent.firstChild)

class DOMProcedure:
    """
    DOMScan (HTML analyzer) callback interface.

    DOMScan calls one of these hooks for every node it visits.  Every
    default implementation does nothing and returns None, which tells
    DOMScan to keep scanning; a hook that returns any other value asks
    DOMScan to stop early.  Subclasses override only the hooks they need.
    """
    def attr(self,node,attr,path):
        """
        Attribute node hook.

        node   : element node owning the attribute
        attr   : attribute node
        path   : path string built by DOMScan for this attribute
        """

    def doctype(self,node,path):
        """
        Doctype node hook.

        node   : doctype node
        path   : path string built by DOMScan
        """

    def pre_elem(self,node,path):
        """
        Element hook called first, before the element's attributes.

        node   : element node
        path   : path string built by DOMScan for this element
        """

    def elem(self,node,path):
        """
        Element hook called after the attributes, before the children.

        node   : element node
        path   : path string built by DOMScan for this element
        """

    def post_elem(self,node,path):
        """
        Element hook called after the element's children.

        node   : element node
        path   : path string built by DOMScan for this element
        """

    def text(self,parent,node,path):
        """
        Text node hook.

        parent : parent node
        node   : text node
        path   : path string built by DOMScan for this text node
        """

    def cdata(self,parent,node,path):
        """
        CDATA section hook.

        parent : parent node
        node   : CDATA node
        path   : path string built by DOMScan for this CDATA node
        """

    def comment(self,parent,node,path):
        """
        Comment node hook.

        parent : parent node
        node   : comment node
        path   : path string built by DOMScan for this comment
        """

    def other (self,node,path):
        """
        Hook for every node DOMScan does not route to another hook.

        node   : the node that was not handled elsewhere
        path   : path string built by DOMScan
        """


class DOMScan:
    """
    HTML analyzer.

    Walks a DOM tree depth-first and reports every visited node to a
    procedure object (see DOMProcedure) together with a path string.
    The scan stops as soon as any hook returns a value other than None,
    and that value becomes the result of execute().

    Path construction:
      element   : <parent path> + '/' + lower-cased tag name
      attribute : <element path> + '/@' + lower-cased attribute name
      text/cdata: <parent element path> + '/text()'
      comment   : <parent element path> + '/comment()'
      doctype   : always '/'
    Children of a Document node are dispatched with the parent path '/',
    so the root element of a document gets a path such as '//html'.
    """
    # mode definition
    MODE_FULL = 0
    MODE_ELEM = 1

    def __init__(self,proc,mode=0):
        """
        proc : procedure instance receiving the callbacks
        mode : MODE_FULL scans every node; MODE_ELEM calls only the
               element hooks (pre_elem / elem / post_elem) and hands all
               other non-document nodes to proc.other()
        """
        self.proc = proc
        self.mode = mode

    def execute(self,node):
        """
        Scan the tree rooted at node.

        node : target node (document or element).
        Returns the first non-None value produced by a hook, else None.
        """
        return self.__disp(None,node,'')

    def __attr(self,node,path):
        # Report every attribute of the element; stop at the first hit.
        for i in range(node.attributes.length):
            attr = node.attributes.item(i)
            ret = self.proc.attr(node,attr,path+'/@'+attr.name.lower())
            if ret is not None:
                return ret
        return None

    def __child(self,node,path):
        # Dispatch every child in document order; stop at the first hit.
        for child in node.childNodes:
            ret = self.__disp(node,child,path)
            if ret is not None:
                return ret
        return None

    def __elem(self,node,path):
        # Element protocol: pre_elem, attributes (full mode only), elem,
        # children, post_elem.  Any non-None result ends the scan.
        ret = self.proc.pre_elem(node,path)
        if ret is not None:
            return ret
        if self.mode != self.MODE_ELEM and node.hasAttributes():
            ret = self.__attr(node,path)
            if ret is not None:
                return ret
        ret = self.proc.elem(node,path)
        if ret is not None:
            return ret
        # A result found in a descendant must be propagated upwards and
        # not be replaced by the post_elem() result.
        ret = self.__child(node,path)
        if ret is not None:
            return ret
        return self.proc.post_elem(node,path)

    def __disp(self,parent,node,path):
        # Route one node to the matching hook according to its node type.
        kind = node.nodeType
        if kind == node.DOCUMENT_NODE:
            return self.__child(node,'/')
        if kind == node.ELEMENT_NODE:
            return self.__elem(node,path+'/'+node.tagName.lower())
        if self.mode != self.MODE_ELEM:
            if kind == node.DOCUMENT_TYPE_NODE:
                return self.proc.doctype(node,'/')
            if kind == node.TEXT_NODE:
                return self.proc.text(parent,node,path+'/text()')
            if kind == node.CDATA_SECTION_NODE:
                return self.proc.cdata(parent,node,path+'/text()')
            if kind == node.COMMENT_NODE:
                return self.proc.comment(parent,node,path+'/comment()')
        return self.proc.other(node,path)

#--------------------------------------------------------------------------------------

class DOMWriter(DOMProcedure):
    """
    Procedure that generates HTML text from the scanned nodes.

    The markup is accumulated in self.html while DOMScan drives the hooks.
    Every hook returns None so the scan always runs to completion.
    Text and attribute values are written as-is, without entity escaping.
    """
    def __init__(self):
        # Output buffer; read by callers once the scan has finished.
        self.html=""

    def attr(self,node,attr,path):
        # Written between '<tag ' (pre_elem) and '>' (elem).
        self.html = self.html + attr.name + ' = \"' + attr.nodeValue + '\" '
        return None

    def doctype(self,node,path):
        self.html = self.html + '<!DOCTYPE ' + node.name + ' PUBLIC \"' + to_unicode(node.publicId) + '\" \"' + to_unicode(node.systemId) + '\">'
        return None

    def pre_elem(self,node,path):
        # Open the start tag; the attributes follow, elem() closes it.
        self.html = self.html + '<' + node.tagName + ' '
        return None

    def elem(self,node,path):
        self.html = self.html + '>'
        return None

    def post_elem(self,node,path):
        self.html = self.html + '</' + node.tagName + '>'
        return None

    def text(self,parent,node,path):
        self.html = self.html + node.data
        return None

    def cdata(self,parent,node,path):
        # A CDATA section is terminated by ']]>' (not ']]').
        self.html = self.html + '<![CDATA[' + node.data + ']]>'
        return None

    def comment(self,parent,node,path):
        self.html = self.html + '<!--' + node.nodeValue + '-->'
        return None

    def other (self,node,path):
        # Any node type without a dedicated hook cannot be serialised.
        raise WebMonitorError('DOMWriter => Unknownm node' + to_unicode(node))

def gen_html (node):
    """
    Serialise a DOM tree back to HTML text.

    node : Target node (root of the tree to dump)

    Runs a full-mode DOMScan with a DOMWriter and returns the text the
    writer accumulated.
    """
    writer = DOMWriter()
    scanner = DOMScan(writer)
    scanner.execute(node)
    return writer.html


#--------------------------------------------------------------------------------------
class DOMDumper(DOMProcedure):
    """
    Node dump procedure. (for debug)

    Prints one line per attribute, element, text and CDATA node in the
    form '<path> : <detail>'.  Doctype and comment nodes are ignored.
    Every hook returns None so the scan always runs to completion.
    """
    # print is called with a single parenthesised argument so the output
    # is the same under Python 2 and the syntax is also valid on Python 3.
    def attr(self,node,attr,path):
        print(path + ' : ' + attr.name + ' => ' + attr.nodeValue)
        return None

    def doctype(self,node,path):
        return None

    def elem(self,node,path):
        print(path + ' : ' + node.tagName)
        return None

    def text(self,parent,node,path):
        print(path + ' : ' + node.data)
        return None

    def cdata(self,parent,node,path):
        print(path + ' : ' + node.data)
        return None

    def comment(self,parent,node,path):
        return None
        
def dump_node (node):
    """
    Print a debug dump of the tree rooted at node.

    node : Target node

    Returns whatever the scan returns.
    """
    dumper = DOMDumper()
    scanner = DOMScan(dumper)
    return scanner.execute(node)

#--------------------------------------------------------------------------------------
class DOMXpath(DOMProcedure):
    """
    Procedure that collects the nodes whose scan path equals a given path.

    The comparison is a plain string equality against the lower-cased
    target path; no XPath expressions are evaluated.
    """
    def __init__(self,xpath,nodes):
        """
        xpath : Target path (lower-cased before it is stored)
        nodes : Result list; matching nodes are appended to it
        """
        self.nodes = nodes
        self.xpath = xpath.lower()

    def __collect(self,candidate,path):
        # Keep the candidate when its path is exactly the wanted one.
        # Always returns None so the scan continues.
        if self.xpath == path:
            self.nodes.append(candidate)
        return None

    def attr(self,node,attr,path):
        # For attributes the attribute node itself is collected.
        return self.__collect(attr,path)

    def doctype(self,node,path):
        return self.__collect(node,path)

    def elem(self,node,path):
        return self.__collect(node,path)

    def text(self,parent,node,path):
        return self.__collect(node,path)

    def cdata(self,parent,node,path):
        return self.__collect(node,path)

    def comment(self,parent,node,path):
        return self.__collect(node,path)

    def other (self,node,path):
        raise WebMonitorError('DOMXpath => Unknownm node' + to_unicode(node))

def xpath (xpath,node,ret=None):
    """
    Collect the nodes below node whose scan path equals xpath.

    xpath : Target path string
    node  : Target node (root of the search)
    ret   : Optional result list to append to (to merge several searches)

    Returns the result list (ret itself when it was supplied).
    """
    found = [] if ret is None else ret
    DOMScan(DOMXpath(xpath,found)).execute(node)
    return found

#--------------------------------------------------------------------------------------
class DOMFinder(DOMProcedure):
    """
    Procedure that collects elements by tag name, attributes and text.

    Tag names, attribute names and attribute values are compared
    case-insensitively; the text condition is compared exactly.
    """
    def __init__(self,tag,attrs,txt,nodes):
        """
        tag   : Target element name
        attrs : Target attribute info {name : value,...}, or None
              :   attribute value match everything if value is None
        txt   : Target text value (None matches any text)
        nodes : Result list; matching elements are appended to it
        """
        self.tag = tag.upper()
        # Normalise the conditions once so elem() compares upper-cased
        # values only.
        self.attrs = {}
        if attrs is not None:
            for k,v in attrs.items():
                self.attrs[k.upper()] = v.upper() if v is not None else None
        self.txt = txt
        self.nodes = nodes

    def elem(self,node,path):
        """
        Append node to the result list when it meets every condition.

        Always returns None so the scan visits the whole tree.
        """
        if node.tagName.upper() != self.tag:
            return None
        for name,wanted in self.attrs.items():
            attr = getAttributeNode(node,name)
            if attr is None:
                # Required attribute is missing.
                return None
            if wanted is not None and attr.nodeValue.upper() != wanted:
                return None
        # The text condition uses the element's direct text children only.
        if self.txt is None or getNodeText(node) == self.txt:
            self.nodes.append(node)
        return None

def find (tag,attrs,txt,node,ret=None):
    """
    Collect the elements below node that match tag, attributes and text.

    tag   : Target element name
    attrs : Target attribute info {name : value,...}
          :   attribute value match everything if value is None
    txt   : Target text value
    node  : Target node (root of the search)
    ret   : Optional result list to append to (to merge several searches)

    Returns the result list (ret itself when it was supplied).
    """
    found = [] if ret is None else ret
    scanner = DOMScan(DOMFinder(tag,attrs,txt,found),DOMScan.MODE_ELEM)
    scanner.execute(node)
    return found


def get_charset(doc):
    """
    Find the charset declared in a <meta content="...charset=..."> tag.

    doc : Parsed html dom document

    Returns a tuple (charset, content_node): the lower-cased charset name
    and the 'content' attribute node it was read from.  When no meta tag
    declares a charset the result is ('utf-8', None).
    """
    # The charset value ends at whitespace, ';' or a quote, so a trailing
    # separator or surrounding quotes are not captured with the name.
    pattern = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)')
    for node in find('meta',{'content':None},None,doc):
        content_node = getAttributeNode(node,'content')
        match = pattern.search(content_node.nodeValue.lower())
        if match is not None:
            return (match.group(1),content_node)
    return ('utf-8',None)
    
def get_forms(req,doc):
    """
    Parse html to take out data of forms

    req  : urllib2.Request instance
    doc  : Target html document node

    Returns a list with one WebMonitorForm per <form> element.  The form
    URL is the 'action' attribute resolved against the request URL, the
    method is switched to POST when the form says so, and the parameters
    are collected from the <input>, <select> and <textarea> controls.
    """
    forms  = []
    charset,n = get_charset(doc)
    for node in find('form',None,None,doc):
        url = ""
        action_node = getAttributeNode(node,'action')
        if action_node is not None:
            url = action_node.nodeValue
        form = WebMonitorForm(eval_relative_url(req.get_full_url(),url),charset)
        method_node = getAttributeNode(node,'method')
        if method_node is not None and method_node.nodeValue.upper() == 'POST':
            form.set_method_post()
        parms = {}
        # input tag
        for field in find('input',None,None,node):
            type_node = getAttributeNode(field,'type')
            # Submit buttons are skipped.  An input without a type
            # attribute is an ordinary text field and is kept.
            if type_node is not None and type_node.nodeValue.lower() == 'submit':
                continue
            name_node = getAttributeNode(field,'name')
            value_node = getAttributeNode(field,'value')
            if name_node is not None and value_node is not None:
                parms[name_node.nodeValue] = value_node.nodeValue
        # select tag
        for select in find('select',None,None,node):
            name_node = getAttributeNode(select,'name')
            if name_node is None:
                continue
            value = ''
            # Only the first option carrying a 'selected' attribute counts.
            options = find('option',{'selected':None},None,select)
            if options:
                value_node = getAttributeNode(options[0],'value')
                if value_node is not None:
                    value = value_node.nodeValue
            parms[name_node.nodeValue] = value
        # textarea tag
        for textarea in find('textarea',None,None,node):
            name_node = getAttributeNode(textarea,'name')
            if name_node is None:
                continue
            # A textarea keeps its value in its text content.  A 'value'
            # attribute is still honoured first for backward compatibility.
            value_node = getAttributeNode(textarea,'value')
            if value_node is not None:
                parms[name_node.nodeValue] = value_node.nodeValue
            else:
                parms[name_node.nodeValue] = getNodeText(textarea)
        # Hand over the parameters only after every control was collected.
        form.set_parms(parms)
        # append
        forms.append(form)
    return forms

def get_href(req,doc,txt):
    """
    Parse html to take out data of <A> tags.

    req  : urllib2.Request instance
    doc  : Target html document node
    txt  : Search condition ( <A> tag text )

    Returns a list of WebMonitorForm objects, one per matching anchor.
    Anchors whose href starts with a scheme other than http/https are
    skipped; an anchor without href is resolved from an empty URL.
    """
    links = []
    charset = get_charset(doc)[0]
    for anchor in find('a',None,txt,doc):
        href_node = getAttributeNode(anchor,'href')
        if href_node is None:
            url = ""
        else:
            url = href_node.nodeValue
        # Skip any explicit scheme that is not http or https.
        if re.match('^(?!http[s]?)\\w+:',url):
            continue
        target = eval_relative_url(req.get_full_url(),url)
        links.append(WebMonitorForm(target,charset))
    return links

def get_link(req,doc,rel):
    """
    Parse html to take out data of <LINK> tags.

    req  : urllib2.Request instance
    doc  : Target html document node
    rel  : Search condition.  Either an attribute dict as accepted by
           find() ({name : value,...}), a plain rel value such as
           'stylesheet' (matched against the rel attribute), or None to
           match every <link>.

    Returns a list of WebMonitorForm objects, one per matching <link>,
    built from the href attribute resolved against the request URL.
    """
    # find() needs a mapping of attribute conditions; a bare value is
    # taken as the wanted value of the 'rel' attribute.
    attrs = rel
    if rel is not None and not hasattr(rel,'items'):
        attrs = {'rel': rel}
    forms  = []
    charset,n = get_charset(doc)
    for node in find('link',attrs,None,doc):
        url = ""
        href_node = getAttributeNode(node,'href')
        if href_node is not None:
            url = href_node.nodeValue
        form = WebMonitorForm(eval_relative_url(req.get_full_url(),url),charset)
        forms.append(form)
    return forms

def get_script(req,doc):
    """
    Parse html to take out data of <SCRIPT> tags.

    req  : urllib2.Request instance
    doc  : Target html document node

    Returns a list of WebMonitorForm objects, one per <script> element,
    built from the src attribute resolved against the request URL.
    """
    # NOTE(review): a script without src is resolved from an empty URL,
    # so it still yields an entry -- confirm this is intended.
    scripts = []
    charset = get_charset(doc)[0]
    for script in find('script',None,None,doc):
        src_node = getAttributeNode(script,'src')
        if src_node is None:
            url = ""
        else:
            url = src_node.nodeValue
        target = eval_relative_url(req.get_full_url(),url)
        scripts.append(WebMonitorForm(target,charset))
    return scripts

def get_img(req,doc):
    """
    Parse html to take out data of <IMG> tags.

    req  : urllib2.Request instance
    doc  : Target html document node

    Returns a list of WebMonitorForm objects, one per <img> element,
    built from the src attribute resolved against the request URL.
    An image without src is resolved from an empty URL.
    """
    images = []
    charset = get_charset(doc)[0]
    for img in find('img',None,None,doc):
        src_node = getAttributeNode(img,'src')
        if src_node is None:
            url = ""
        else:
            url = src_node.nodeValue
        target = eval_relative_url(req.get_full_url(),url)
        images.append(WebMonitorForm(target,charset))
    return images

def get_doc(html):
    """
    Parse an html string into a DOM document.

    html : Target html string

    Returns the document produced by html5lib's 'dom' tree builder.
    Raises WebMonitorError (with the html appended to the message) when
    the parser fails.
    """
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    try:
        doc = parser.parse(html)
    except Exception:
        # Only real parse failures are converted; KeyboardInterrupt and
        # SystemExit are no longer swallowed as a bare except would do.
        raise WebMonitorError('Html parse error ! \n' + html)
    return doc

def get_title(doc):
    """
    Page title

    doc : Parsed html dom document

    Returns the text of the first <title> element passed through
    to_unicode(), or None when the document has no <title>.
    """
    title_nodes = find('title',None,None,doc)
    if not title_nodes:
        return None
    return to_unicode(getNodeText(title_nodes[0]))
