#-*- coding: utf-8 -*-
'''
@author: Miguel Hermo Serans
'''
from miopia.util.BinaryTree import BinaryTree
import codecs
import os
#from unidecode import unidecode

class PsychometricDictionary():
    """
    Handles psychometric dictionaries which follow the LIWC format.

    A dictionary maps words to lists of category ids and category ids to
    category names. Entries ending in "*" are treated as prefixes and are
    stored in a BinaryTree so the longest matching prefix of a word can be
    looked up.
    """
    # Class-level defaults; generateDictionary() rebinds all of them on the
    # instance, so these only describe the "nothing loaded yet" state.
    _categories = {}          # Dictionary<id_category, category_name>
    _words = {}               # Dictionary<word, List<id_category>> (+ 'REGEXP' entry)
    _root = None              # root node of the prefix tree, None when it is empty
    _regexp_keys_tree = None  # BinaryTree holding the prefix ("*") entries

    def _generate_tree(self):
        """
        Create a binary tree for an efficient search of the prefix entries
        stored under self._words['REGEXP'].

        The first entry becomes the root node; every other entry is inserted
        below it. When there are no prefix entries self._root stays None.
        """
        regexp_keys_tree = BinaryTree()
        self._root = None
        for key, id_categories in self._words['REGEXP'].items():
            if self._root is not None:
                regexp_keys_tree.insert(self._root, key, id_categories)
            else:
                self._root = regexp_keys_tree.addNode(key, id_categories)
        self._regexp_keys_tree = regexp_keys_tree

    def get_psychometric_categories(self, str_word):
        """
        @param str_word: A string. A word. It is lower-cased before the lookup.
        @return: A list with the names of the psychometric categories of
        str_word. An empty list if str_word is None, if the word is unknown
        or if no dictionary has been generated yet.
        """
        if str_word is None:
            return []

        str_word = str_word.lower()

        # Exact entries take precedence over prefix ("*") entries.
        if str_word in self._words:
            return [self._categories[category]
                    for category in self._words[str_word]]

        # No dictionary generated yet: there is no prefix tree to search.
        if self._regexp_keys_tree is None:
            return []

        list_id_psychometrics = self._regexp_keys_tree.search_longest_match(
            self._root, str_word)
        try:
            return [self._categories[category]
                    for category in list_id_psychometrics]
        except TypeError:
            # The search returned something which is not iterable
            # (presumably None when no prefix matches -- confirm in BinaryTree).
            return []

    def generateDictionary(self, categories, words, ascii_only=True):
        """
        It generates the psychometric dictionary
        @param categories: dictionary of categories, Dictionary<id_category,category_name>
        @param words: dictionary of words, Dictionary<word, List<id_category>>. Wildcard "*" is supported
        as the last character of a word and marks it as a prefix entry.
        @param ascii_only: if true, category names and words are coerced to unicode text.
        NOTE: the unidecode-based conversion to the closest ascii representation
        is currently disabled, so no transliteration takes place.
        """
        # 'unicode' only exists on Python 2; fall back to 'str' elsewhere.
        try:
            to_text = unicode
        except NameError:
            to_text = str

        # Psychometric categories
        dict_categories = {}
        for id_cat in categories:
            category_name = categories[id_cat]
            dict_categories[id_cat] = to_text(category_name) if ascii_only else category_name

        # Psychometric words
        dict_words = {}
        dict_regexp_psychometric_words = {}
        for key in words:
            plain_word = to_text(key) if ascii_only else key
            if key.endswith('*'):
                # Prefix entries are indexed without the trailing wildcard.
                dict_regexp_psychometric_words[plain_word[:-1]] = words[key]
            elif plain_word in dict_words:
                # Two source words collapsed into the same key: keep only the
                # category ids shared by both.
                dict_words[plain_word] = [w for w in dict_words[plain_word]
                                          if w in words[key]]
            else:
                dict_words[plain_word] = words[key]

        # The prefix entries live inside the word dictionary under 'REGEXP'.
        dict_words['REGEXP'] = dict_regexp_psychometric_words

        self._words = dict_words
        self._categories = dict_categories
        self._generate_tree()

    def readFromFile(self, filename=None, encoding='utf-8'):
        """
        Reads a psychometric dictionary from a LIWC-formatted file: the text
        between the first and the second "%" lists the categories
        (id<TAB>name) and the text after the second "%" lists the words
        (word<TAB>id<TAB>id...).
        @param filename: Path to the dictionary. If it is None or the path
        does not exist an empty dictionary is generated.
        @param encoding: Encoding of the dictionary
        """
        if filename is None or not os.path.exists(filename):
            categories, words = {}, {}
        else:
            # Context manager so the file handle is always released.
            with codecs.open(filename, encoding=encoding) as psycho_file:
                str_psycho_dict = psycho_file.read()

            sections = str_psycho_dict.split('%')
            str_categories = sections[1].replace('\r', '').split('\n')
            str_words = sections[2].replace('\r', '').split('\n')

            categories = {int(c[0]): c[1]
                          for c in [l.split("\t") for l in str_categories if l != ""]}
            # A real list (not a lazy map object) so the ids can be read repeatedly.
            words = {w[0]: [int(id_cat) for id_cat in w[1:]]
                     for w in [l.split("\t") for l in str_words if l != ""]}

        self.generateDictionary(categories, words)