#-*- coding: utf-8 -*-
'''
@author: David Vilares Calvo
'''
from collections import defaultdict
from miopia.util.ConfigurationManager import ConfigurationManager
from miopia.util.exceptions.UnknownSOException import *
from miopia.analyzer.SemanticCategory import SemanticCategory
from miopia.analyzer.PsychometricDictionary import PsychometricDictionary
import codecs
import os
import math
import warnings 

class Dictionary(object):
    '''
    A class for obtaining the semantic resources provided with MIOPIA.

    It holds one semantic orientation (SO) dictionary per lexical category
    (nouns, adjectives, adverbs, verbs and intensifiers), a nested lemmas
    dictionary, an optional psychometric dictionary and, after L{adapt} has
    been called, a domain-specific SO dictionary.
    '''
    __uniqueInstance = None

    def __init__(self, noun=None, adj=None, adv=None, verb=None, intd=None,
                 lemmas=None,
                 p_dict=None):
        '''
        @param noun: nouns, as a dictionary {lemma: semantic orientation}
        @param adj: adjectives, as a dictionary {lemma: semantic orientation}
        @param adv: adverbs, as a dictionary {lemma: semantic orientation}
        @param verb: verbs, as a dictionary {lemma: semantic orientation}
        @param intd: intensifiers, as a dictionary {lemma: value}
        @param lemmas: lemmas, as a nested dictionary
        D[lexical_category][token]
        @param p_dict: A L{PsychometricDictionary}

        If every dictionary is empty (or omitted) and no p_dict is given,
        the resources are loaded from the paths provided by the
        L{ConfigurationManager}; that mode is deprecated.
        '''
        # None stands for "not supplied": a fresh dict per instance avoids
        # sharing one mutable default object between all the instances.
        noun = {} if noun is None else noun
        adj = {} if adj is None else adj
        adv = {} if adv is None else adv
        verb = {} if verb is None else verb
        intd = {} if intd is None else intd
        lemmas = {} if lemmas is None else lemmas

        nothing_supplied = (noun == adj == adv == verb == intd == lemmas == {}
                            and p_dict is None)

        if nothing_supplied:
            warnings.warn("deprecated: dictionaries should be passed as arguments",DeprecationWarning)

            encoding_dict = "ISO-8859-15"
            path = ConfigurationManager(lang='es').getParameter("path_SODictionaries")
            self._noun_dict = self._get_words(path+"/ciao+SD_noun_dict_spa.txt",encoding_dict)
            self._adj_dict  = self._get_words(path+"/ciao+SD_adj_dict_spa.txt",encoding_dict)
            self._adv_dict  = self._get_words(path+"/ciao+SD_adv_dict_spa.txt",encoding_dict)
            self._verb_dict = self._get_words(path+"/ciao+SD_verb_dict_spa.txt",encoding_dict)
            self._int_dict  = self._get_words(path+"/int_dict_spa.txt",encoding_dict)
            path = ConfigurationManager().getParameter("path_lemmas")
            self._lemmas = self._build_lemmas_dictionary(path)

            # The psychometric dictionary
            p_dict = PsychometricDictionary()
            p_dict.readFromFile(ConfigurationManager().getParameter("path_lwic_dictionary"))
            self._psychometric_dict = p_dict
        else:
            self._noun_dict = noun
            self._adj_dict  = adj
            self._adv_dict  = adv
            self._verb_dict = verb
            self._int_dict  = intd
            self._lemmas = lemmas
            # Always define the attribute (possibly as None) so that it
            # exists even when no psychometric dictionary was supplied.
            self._psychometric_dict = p_dict

        self._domain_dict = None
        self._neg_pond = 0

    def _get_words(self,name_of_dict,coding="utf-8"):
        """
        @param name_of_dict: The path to a semantic orientation dictionary
        @param coding: The encoding of the file
        @return: A dictionary {word,semantic orientation value}
        @deprecated: dictionaries should be passed as arguments to the constructor
        """
        # The context manager guarantees the file handle is closed.
        with codecs.open(name_of_dict,encoding=coding) as handle:
            lines = handle.readlines()

        D = {}
        for line in lines:
            columns = line.split()
            # Only well-formed "word value" lines are considered and the
            # first occurrence of a word wins.
            if len(columns) == 2:
                if columns[0] not in D and columns[0] != '':
                    D[columns[0]] = float(columns[1])
        return D

    def _build_lemmas_dictionary(self,lemmas_file):
        """
        Build a nested dictionary D[lexical_category][token] to get lemmas
        @param lemmas_file: A path to the lemmas dict file
        @precondition: lemmas dict must has this format: CrossTag\\tToken\\tLemma
        @return: A nested dictionary with the lemmas
        """
        with codecs.open(lemmas_file,encoding="utf-8") as handle:
            lines = handle.readlines()

        # The inner defaultdict has no default factory, so a missing token
        # still raises KeyError, which L{get_lemma} relies on.
        D = defaultdict(defaultdict)
        for line in lines:
            columns = line.split('\t')
            if len(columns) == 3:
                # Strip only the line terminator: a last line without a
                # trailing newline must keep the final character of the lemma.
                D[columns[0]][columns[1]] = columns[2].rstrip("\r\n")
        return D

    def adapt(self,dict_domain_info,p,n,neg_pond=0,threshold=0.5,minimum_ocurrences=1):
        """
        Builds the domain dictionary used by L{get_semantic_orientation}.

        @param dict_domain_info: A dictionary returned by the L{DomainAdaptor}.
        The attribute form is the key. The value is a tuple
        (avg_position_in_raking, ocurrences_in_pos_files, ocurrences_in_negatives_files).
        @param p: A number between 0 and 1. The proportion of positive words included in the dictionary.
        @param n: A number between 0 and 1. The proportion of negative words included in the dictionary.
        @param neg_pond: Weighting for negative words. It is stored and applied
        when a semantic orientation is requested.
        @param threshold: The minimum absolute semantic orientation an
        attribute needs to be considered.
        @param minimum_ocurrences: The minimum number of occurrences (positive
        plus negative) an attribute needs to be considered.
        """
        scale = 5.0  # Orientations are rescaled to the range [-5, 5]
        relax = 1.0  # Smoothing term, avoids divisions by zero and log(0)

        self._neg_pond = neg_pond

        def log_ratio(numerator, denominator):
            return math.log(float(numerator + relax) / float(denominator + relax), 2)

        # The largest positive and negative log-ratios act as normalisers.
        max_pos_dif = 0.0
        max_neg_dif = 0.0
        for info in dict_domain_info.values():
            max_pos_dif = max(max_pos_dif, log_ratio(info[1], info[2]))
            max_neg_dif = max(max_neg_dif, log_ratio(info[2], info[1]))

        def orientation(atr_pos, atr_neg):
            if atr_pos > atr_neg:
                return (log_ratio(atr_pos, atr_neg) / max_pos_dif) * scale
            if atr_pos == atr_neg:
                # A balanced attribute is neutral. Returning here also avoids
                # 0.0/0.0 when no attribute leans towards the negative side.
                return 0.0
            return (log_ratio(atr_neg, atr_pos) / max_neg_dif) * -scale

        total_pos = 0
        total_neg = 0
        dict_clear_polarity_attributes = {}
        for atr, info in dict_domain_info.items():
            so = orientation(info[1], info[2])
            if abs(so) >= threshold and info[1] + info[2] >= minimum_ocurrences:
                if so < 0:
                    total_neg += 1
                if so > 0:
                    total_pos += 1
                dict_clear_polarity_attributes[atr] = so

        # Candidates are visited in ascending order of their info tuple.
        list_clear_polarity_attributes = sorted(dict_clear_polarity_attributes,
                                                key = dict_domain_info.get)
        p_atr,n_atr = total_pos*p, total_neg*n
        p_included,n_included = 0,0
        self._domain_dict = {}
        for atr in list_clear_polarity_attributes:
            so = dict_clear_polarity_attributes[atr]
            if so > 0 and p_included < p_atr:
                self._domain_dict[atr] = so
                p_included += 1
            if so < 0 and n_included < n_atr:
                self._domain_dict[atr] = so
                n_included += 1

            if p_included >= p_atr and n_included >= n_atr:
                break

    def is_intensifier_term(self,lemma):
        """
        @param lemma: A lemma
        @return: True if the lemma is in the intensifiers dictionary,
        False otherwise
        """
        return lemma in self._int_dict

    def get_semantic_orientation(self,lemma,semantic_category):
        """
        @param lemma: The lemma whose semantic orientation is requested
        @param semantic_category: A value in the collection {'n','a','v','r','i'}, 'n' is a noun, 'a' is an adjetive, 'v' is a verb, 'r' is an adverb and 'i' is an intensifier.
        @raise UnknownSOException: Raises this exception if term hasn't semantic orientation
        @return: Semantic orientation of a word
        """
        so = 0 #Semantic orientation of a word

        dictionaries = ((SemanticCategory.ADJECTIVE, self._adj_dict),
                        (SemanticCategory.NOUN, self._noun_dict),
                        (SemanticCategory.VERB, self._verb_dict),
                        (SemanticCategory.ADVERB, self._adv_dict),
                        (SemanticCategory.INTENSIFIER, self._int_dict))
        for category, so_dict in dictionaries:
            if semantic_category == category and lemma in so_dict:
                so = so_dict.get(lemma)

        # A non-zero value in the domain dictionary overrides the generic one
        # for every category but the intensifiers.
        domain_so = None
        if self._domain_dict is not None:
            domain_so = self._domain_dict.get(lemma)
        if (domain_so and
            semantic_category in (SemanticCategory.NOUN, SemanticCategory.ADJECTIVE,
                                  SemanticCategory.VERB, SemanticCategory.ADVERB)):
            so = domain_so

        if so == 0:
            raise UnknownSOException("Term hasn't semantic orientation")
        # Only negative orientations are scaled by the weighting set in adapt.
        return so*(1 + self._neg_pond*(so<0))

    def get_psychometric_categories(self,str_word):
        """
        Delegates on the psychometric dictionary.
        @param str_word: A word
        @return: Whatever the psychometric dictionary returns for the word
        @raise AttributeError: If no psychometric dictionary is available
        """
        return self._psychometric_dict.get_psychometric_categories(str_word)

    def get_lemma(self,lexical_category,token):
        """
        Tries the token as is, then its lowercase form, then the heuristics
        of L{_heuristic_lemma}; the token itself is the last resort.
        @param token: A token
        @param lexical_category: The lexical category of the token
        @return: The lemma of the token
        """
        # 'except Exception' keeps the best-effort behaviour without
        # swallowing KeyboardInterrupt or SystemExit.
        try:
            return self._lemmas[lexical_category][token]
        except Exception:
            try:
                return self._lemmas[lexical_category][token.lower()]
            except Exception:
                try:
                    return self._heuristic_lemma(lexical_category, token)
                except Exception:
                    #XXX: In SFU None was returned
                    return token

    def _heuristic_lemma(self,lexical_category, word):
        """
        @param lexical_category: The lexical category of the word
        @param word: A token
        @raise KeyError: If processed token isn't at lemmas dictionary
        @return: A possible lemma of the token
        """
        lemmas = self._lemmas[lexical_category]

        if lexical_category in ['a']:
            # Plural: drop the final 's'
            if word.endswith("s"):
                return lemmas[word[:-1]]

            # Feminine: try the masculine form first, then the diminutive
            if word.endswith("a"):
                try:
                    return lemmas[word[:-1]+"o"]
                except KeyError:
                    if word.endswith("ita"):
                        return lemmas[word[:-3]+"a"]
                    raise KeyError(word)

        if lexical_category == 'v':
            # Enclitic pronouns attached to the verb
            if word.endswith(("se", "le", "me", "lo", "la")):
                return lemmas[word[:-2]]
            if word.endswith(("nos", "les")):
                return lemmas[word[:-3]]

        if lexical_category == 'n':
            # Feminine diminutives
            if word.endswith("ita"):
                if word.endswith("quita"):
                    return lemmas[word[:-5]+"ca"]
                elif word.endswith("ecita"):
                    return lemmas[word[:-5]+"a"]
                else:
                    return lemmas[word[:-3]+"a"]

            # Masculine diminutives
            if word.endswith("ito"):
                if word.endswith("quito"):
                    return lemmas[word[:-5]+"co"]
                elif word.endswith("cito"):
                    return lemmas[word[:-4]]
                else:
                    return lemmas[word[:-3]+"o"]

        return lemmas[word]

