Package miopia :: Package analyzer :: Module PsychometricDictionary
[hide private]
[frames] | no frames]

Source Code for Module miopia.analyzer.PsychometricDictionary

  1  #-*- coding: utf-8 -*- 
  2  ''' 
  3  @author: Miguel Hermo Serans 
  4  ''' 
  5  from miopia.util.BinaryTree import BinaryTree 
  6  import codecs 
  7  import os 
  8  #from unidecode import unidecode 
  9   
class PsychometricDictionary():
    """
    This class is responsible for handling psychometric dictionaries which
    follow the LIWC format.

    Exact words are stored in C{_words}; entries ending with the wildcard
    "*" are stored (without the asterisk) in C{_words['REGEXP']} and indexed
    in a L{BinaryTree} so the longest matching entry can be searched.
    """
    # Class-level defaults only: generateDictionary() rebinds all of these on
    # the instance, so no mutable state is shared between instances.
    _categories = {}
    _words = {}
    _root = None
    _regexp_keys_tree = None

    def _generate_tree(self):
        """
        Create a binary tree for an efficient search of the wildcard entries
        stored in C{self._words['REGEXP']}.

        Rebuilds C{self._root} and C{self._regexp_keys_tree} from scratch.
        """
        regexp_keys_tree = BinaryTree()
        self._root = None
        for key, id_categories in self._words['REGEXP'].items():
            if self._root is not None:
                regexp_keys_tree.insert(self._root, key, id_categories)
            else:
                # The first inserted key becomes the root of the tree.
                self._root = regexp_keys_tree.addNode(key, id_categories)
        self._regexp_keys_tree = regexp_keys_tree

    def get_psychometric_categories(self, str_word):
        """
        @param str_word: A string. A word. It is lower-cased before lookup.
        @return: A list with the psychometric categories of str_word. An
        empty list is returned if str_word is None, if no entry matches, or
        if no dictionary has been generated yet.
        """
        if str_word is None:
            return []

        str_word = str_word.lower()
        # A lower-cased word can never equal the reserved 'REGEXP' key, so
        # this lookup only ever hits real word entries.
        if str_word in self._words:
            return [self._categories[category]
                    for category in self._words[str_word]]

        # No dictionary generated yet: there is no tree to search.
        if self._regexp_keys_tree is None:
            return []

        list_id_psychometrics = self._regexp_keys_tree.search_longest_match(
            self._root, str_word)
        try:
            return [self._categories[category]
                    for category in list_id_psychometrics]
        except TypeError:
            # NOTE(review): presumably search_longest_match returns None when
            # nothing matches, which is not iterable - confirm in BinaryTree.
            return []

    def generateDictionary(self, categories, words, ascii_only=True):
        """
        It generates the psychometric dictionary.

        @param categories: dictionary of categories,
        Dictionary<id_category, category_name>
        @param words: dictionary of words, Dictionary<word, List<id_category>>.
        A trailing wildcard "*" is supported.
        @param ascii_only: if true, category names and words are coerced to
        text strings. NOTE(review): the transliteration to the closest ascii
        representation (unidecode) is disabled in this module, so no
        character is actually replaced.
        """
        try:
            to_text = unicode  # noqa: F821 - Python 2 text type
        except NameError:
            to_text = str  # Python 3: str is already the text type

        category_names = {}
        plain_words = {}
        regexp_words = {}

        # Psychometric categories
        for id_cat in categories:
            name = categories[id_cat]
            category_names[id_cat] = to_text(name) if ascii_only else name

        # Psychometric words
        for key in words:
            plain_word = to_text(key) if ascii_only else key
            if key.endswith('*'):
                # Wildcard entry: stored without the trailing asterisk.
                regexp_words[plain_word[:-1]] = words[key]
            elif plain_word in plain_words:
                # Two source keys collapsed into the same word: only the
                # categories present in both entries are kept.
                plain_words[plain_word] = [w for w in plain_words[plain_word]
                                           if w in words[key]]
            else:
                plain_words[plain_word] = words[key]

        self._words = plain_words
        self._categories = category_names
        # The wildcard entries live under a reserved key of the same dict; a
        # literal word 'REGEXP' in the input is overwritten by this mapping.
        self._words['REGEXP'] = regexp_words
        self._generate_tree()

    @staticmethod
    def _parse_liwc(str_psycho_dict):
        """
        Parse the text of a LIWC-formatted dictionary.

        The text is split on "%": the second chunk holds one
        "id<TAB>name" category per line and the third chunk holds one
        "word<TAB>id<TAB>id..." entry per line. Blank lines are skipped.

        @param str_psycho_dict: the whole content of the dictionary file.
        @return: a tuple (categories, words) with Dictionary<int, name> and
        Dictionary<word, List<int>>.
        @raise IndexError: if the text has fewer than two "%" separators or
        a category line has no name field.
        @raise ValueError: if a category id is not an integer.
        """
        sections = str_psycho_dict.split('%')
        category_lines = sections[1].replace('\r', '').split('\n')
        word_lines = sections[2].replace('\r', '').split('\n')

        categories = {}
        for line in category_lines:
            if not line.strip():
                continue
            fields = line.split("\t")
            categories[int(fields[0])] = fields[1]

        words = {}
        for line in word_lines:
            if not line.strip():
                continue
            fields = line.split("\t")
            # A real list (not a lazy map object) so the ids can be iterated
            # on every lookup; empty fields from stray tabs are ignored.
            words[fields[0]] = [int(field) for field in fields[1:]
                                if field.strip()]
        return categories, words

    def readFromFile(self, filename=None, encoding='utf-8'):
        """
        Reads a psychometric dictionary from a LIWC-formatted file.

        @param filename: Path to the dictionary. If it is None or the path
        does not exist, an empty dictionary is generated.
        @param encoding: Encoding of the dictionary
        """
        if filename is None or not os.path.exists(filename):
            categories, words = {}, {}
        else:
            # The context manager closes the file even if reading fails.
            with codecs.open(filename, encoding=encoding) as dict_file:
                str_psycho_dict = dict_file.read()
            categories, words = self._parse_liwc(str_psycho_dict)

        self.generateDictionary(categories, words)
107