'''
@author: David Vilares Calvo
'''

import re
import codecs
from miopia.preparator.LexicalSentimentInfo import LexicalSentimentInfo, LexicalValenceShifter
from miopia.util.ConfigurationManager import ConfigurationManager
import xml.etree.ElementTree as ET


class LexicalProcessor(object):
    """
    Lexical pre-processing of raw text: sentence splitting, tokenization,
    normalization of emphatic writing (fully upper-cased words and
    replicated characters such as 'holaaa') and tagging.

    The emphatic phenomena found while normalizing are recorded in a
    L{LexicalSentimentInfo}, which can be saved to / loaded from XML.
    """

    # A run of two or more identical consecutive characters.
    _REPLICATION_PATTERN = re.compile(r'(.)\1+')

    # A kept run is shortened to two characters first, then to one.
    _KEPT_RUN_LENGTHS = (2, 1)

    def __init__(self, sentence_tokenizer, tokenizer, tagger,
                 vocabulary_set=None):
        """
        Constructor
        @param sentence_tokenizer: Object exposing C{tokenize(text)} that
        returns the sentences of a text (e.g. tokenizers/punkt/spanish.pickle
        from nltk_data)
        @param tokenizer: Object exposing C{tokenize(sentence)} that returns
        the tokens of a sentence (e.g. an nltk PunktWordTokenizer)
        @param tagger: Object exposing C{tag(tokens)} (e.g. the unserialized
        spanish_brill.pickle included in this package)
        @param vocabulary_set: A Python set with the vocabulary. When omitted
        every instance gets its own empty set.
        """
        self._sentence_tokenizer = sentence_tokenizer
        self._tokenizer = tokenizer
        self._tagger = tagger
        # A fresh container per instance: a mutable default argument would be
        # shared by every processor created without an explicit vocabulary.
        if vocabulary_set is None:
            vocabulary_set = set()
        self._vocabulary_set = vocabulary_set

    def get_vocabulary_set(self):
        """
        @return: The vocabulary container used by this processor
        """
        return self._vocabulary_set

    def _build_vocabulary_set(self, path_to_file):
        """
        @param path_to_file: A path to the file with the vocabulary of words.
        @return: A set with the whitespace-separated words of the file

        Example of the structure of a vocabulary file:
        Word1
        Word2
        ...
        WordN
        """
        # The context manager closes the file even if reading fails.
        with codecs.open(path_to_file, encoding='utf-8') as vocabulary_file:
            return set(vocabulary_file.read().split())

    def extract_sentences(self, text):
        """
        @param text: A String
        @return: A list of strings with the sentences of the text
        """
        return self._sentence_tokenizer.tokenize(text)

    def _replications(self, token):
        """
        @param token: A String
        @return: A list with the runs of repeated chars in the token (each
        run has length >= 2), in order of appearance. Tokens that parse as a
        number yield an empty list.
        """
        # Numbers can have replicated digits but they aren't 'replications'
        try:
            float(token)
        except ValueError:
            return [match.group()
                    for match in self._REPLICATION_PATTERN.finditer(token)]
        return []

    def _in_vocabulary(self, word):
        """
        @param word: A String
        @return: True if the word, as given or lower-cased, is in the
        vocabulary
        """
        # lower() is used to match words with the first letter capitalised
        return (word in self._vocabulary_set
                or word.lower() in self._vocabulary_set)

    def _eliminate_replications(self, token, replications):
        """
        @param token: A String
        @param replications: A list with the replicated chars of a token
        @return: The shortest valid word of the vocabulary obtained by
        shortening the replications, if it exists; otherwise returns the
        original word.
        """
        if self._in_vocabulary(token):
            return token

        candidates = []
        # Possible combinations e.g. (xx,yy) (xx,y) (x,yy): every run is
        # shortened on its own and then together with each other run.
        for run in replications:
            # In Spanish only two repeated chars are supported.
            for kept in self._KEPT_RUN_LENGTHS:
                reduced = token.replace(run, run[:kept])
                if self._in_vocabulary(reduced):
                    candidates.append(reduced)
                for other_run in replications:
                    # Compared by value: identity of equal strings is an
                    # interpreter detail and must not drive the logic.
                    if other_run == run:
                        continue
                    for other_kept in self._KEPT_RUN_LENGTHS:
                        doubly_reduced = reduced.replace(
                            other_run, other_run[:other_kept])
                        if (self._in_vocabulary(doubly_reduced)
                                and doubly_reduced not in candidates):
                            candidates.append(doubly_reduced)

        if not candidates:
            return token
        # min() keeps the first of the shortest candidates.
        return min(candidates, key=len)

    def _is_upper_intesification(self, token):
        """
        @param token: A String
        @return: True if the token is completely upper-cased and is not, as
        written, in the vocabulary; False otherwise
        """
        # str.isupper() is False for tokens without cased chars, so symbols
        # like ',' or '.' are never considered.
        return (token.isupper() and token not in self._vocabulary_set)

    def _is_intensifier_replication(self, replications):
        """
        @param replications: A list with the replicated chars of a token
        @return: True if some replication has three or more chars and is not
        an ellipsis, False otherwise (also for an empty list)
        """
        # '...' is a special case: an ellipsis is punctuation, not emphasis.
        # Every run is checked so the answer does not depend on the order of
        # the list when an ellipsis ties with another run.
        return any(len(replication) >= 3 and replication != '...'
                   for replication in replications)

    def extract_tokens(self, sentences):
        """
        @param sentences: A list of the sentences of a given text
        @return: A tuple with a list of lists of normalized tokens (one list
        per sentence) and a L{LexicalSentimentInfo} instance with the lexical
        sentiment information for the text. L{LexicalSentimentInfo}
        is None if no lexical sentiment info is found.
        """
        lsi = None
        sentences_tokenized = []

        # Sentence and token identifiers start at 1.
        for id_sentence, sentence in enumerate(sentences, 1):
            processed_tokens = []
            raw_tokens = self._tokenizer.tokenize(sentence)
            for id_token, token in enumerate(raw_tokens, 1):

                if self._is_upper_intesification(token):
                    if lsi is None:
                        lsi = LexicalSentimentInfo()
                    # Lower-cased so that the following steps can match the
                    # word against the vocabulary.
                    token = token.lower()
                    lsi.add_lexical_valence_shifter(
                        id_sentence, id_token, LexicalValenceShifter.CAPS)

                replications = self._replications(token)
                if replications:
                    # The token is always normalized, but runs of only two
                    # chars are not considered intensifying.
                    token = self._eliminate_replications(token, replications)
                    if self._is_intensifier_replication(replications):
                        if lsi is None:
                            lsi = LexicalSentimentInfo()
                        lsi.add_lexical_valence_shifter(
                            id_sentence, id_token,
                            LexicalValenceShifter.REPLICATION)

                processed_tokens.append(token)
            sentences_tokenized.append(processed_tokens)
        return (sentences_tokenized, lsi)

    def extract_tags(self, tokenized_sentences):
        """
        @param tokenized_sentences: A list of lists of tokens
        @return: A list of tagged sentences, one per input sentence, as
        returned by the C{tag} method of the tagger
        """
        return [self._tagger.tag(tokenized_sentence)
                for tokenized_sentence in tokenized_sentences]

    def create_lexical_info_XML(self, dict_of_lsi, path_dest):
        """
        Writes in path_dest a XML representation of the L{LexicalSentimentInfo}
        of each file. Entries whose value is None are skipped.
        @param path_dest: A path to the destination XML file
        @param dict_of_lsi: A dictionary {file id: L{LexicalSentimentInfo}}.
        C{get_dict()} of each value is expected to map a sentence id to a
        dictionary {word id: iterable of phenomenon names (strings)}.
        """

        def create_element(parent, element_name, text):
            element = ET.SubElement(parent, element_name)
            element.text = text
            return element

        files = ET.Element('files')
        for lsi_file, lsi in dict_of_lsi.items():
            if lsi is None:
                continue
            file_element = ET.SubElement(files, 'file')
            create_element(file_element, 'fileid', lsi_file)
            sentences = ET.SubElement(file_element, 'sentences')
            words_by_sentence = lsi.get_dict()
            for sentence_key in words_by_sentence:
                sentence = ET.SubElement(sentences, 'sentence')
                create_element(sentence, 'sentenceid', str(sentence_key))
                words = ET.SubElement(sentence, 'words')
                for word_key, phenomena in words_by_sentence[sentence_key].items():
                    word = ET.SubElement(words, 'word')
                    create_element(word, 'wordid', str(word_key))
                    create_element(word, 'phenomena', ','.join(phenomena))
        # ElementTree opens and closes the destination by itself; no extra
        # file handle on the same path is needed.
        ET.ElementTree(files).write(path_dest, 'UTF-8')

    def read_lexical_info_XML(self, input_path):
        """
        @param input_path: A path to a XML file with the lexical sentiment
        info needed to build a dict of L{LexicalSentimentInfo}
        @return: A dictionary {file id: L{LexicalSentimentInfo}}
        @raise KeyError: If a phenomenon name of the file is not known
        """
        # Name used in the XML file -> valence shifter constant
        switch = {'CAPS': LexicalValenceShifter.CAPS,
                  'REPLICATION': LexicalValenceShifter.REPLICATION}

        root = ET.parse(input_path).getroot()
        dict_of_lsi = {}

        for file_element in root.findall('file'):
            lsi = LexicalSentimentInfo()
            for sentence in file_element.find('sentences').findall('sentence'):
                for word in sentence.find('words').findall('word'):
                    phenomena_text = word.find('phenomena').text
                    # An empty <phenomena/> element is parsed with text None:
                    # there is nothing to register for that word.
                    if not phenomena_text:
                        continue
                    id_sentence = int(sentence.find('sentenceid').text)
                    id_word = int(word.find('wordid').text)
                    for phenomenon in phenomena_text.split(','):
                        lsi.add_lexical_valence_shifter(id_sentence,
                                                        id_word,
                                                        switch[phenomenon])
            dict_of_lsi[file_element.find('fileid').text] = lsi
        return dict_of_lsi
            