1
2 '''
3 @author: Miguel Hermo Serans
4 '''
5 from miopia.util.BinaryTree import BinaryTree
6 import codecs
7 import os
8
9
11 """
12 This class is responsible for handling psychometric dictionaries which follow the LIWC format
13 """
14 _categories = []  # placeholder; the dictionary-building code in this file later assigns a dict of id_category -> category_name
15 _words = {}  # word -> list of category ids; the 'REGEXP' key holds the wildcard entries
16 _root = None  # root node of the wildcard key tree; assigned when the tree is generated
17 _regexp_keys_tree = None  # BinaryTree over the wildcard keys; assigned when the tree is generated
18
19
21 """
22 Build a binary tree over the keys of self._words['REGEXP'] so that the regexp (wildcard) entries of this kind
23 of dictionary can be searched efficiently. The tree is kept in self._regexp_keys_tree and its root node in self._root.
24 """
25 regexp_keys_tree = BinaryTree()
26 self._root = None
27 for key in self._words['REGEXP'].keys():
28 if self._root != None:
29 regexp_keys_tree.insert(self._root, key, self._words['REGEXP'].get(key))  # later keys are inserted starting from the existing root
30 else:
31 self._root = regexp_keys_tree.addNode(key, self._words['REGEXP'].get(key))  # first key: the value returned by addNode becomes the root
32 self._regexp_keys_tree = regexp_keys_tree
33
34
35
37 """
38 @param str_word: A string. A word. It is lower-cased before the lookup; None is accepted.
39 @return: A list with the psychometric categories of str_word. The list is empty when str_word is None or when the wildcard lookup ends in a TypeError.
40 """
41
42 if str_word is None:
43 return []
44 else:
45 str_word = str_word.lower()
46 if self._words.has_key(str_word):  # exact match is tried first; dict.has_key only exists in Python 2
47 return [self._categories[category]
48 for category in self._words[str_word]]
49 else:
50 list_id_phsychometrics = self._regexp_keys_tree.search_longest_match(self._root,str_word)  # otherwise search the tree built from the wildcard keys
51 try:
52 return [self._categories[category]
53 for category in list_id_phsychometrics]
54 except TypeError:  # presumably search_longest_match returned a non-iterable (e.g. None) when nothing matched - TODO confirm
55 return []
56
58 """
59 It generates the psychometric dictionary
60 @param categories: dictionary of categories, Dictionary<id_category,category_name>
61 @param words: dictionary of words, Dictionary<word, List<id_category>>. Wildcard "*" is supported
62 @param ascii_only: if true, all unicode characters will be converted to the closest ascii representation
63 """
# NOTE(review): when ascii_only is true the code below only calls unicode() on the
# category names and words; no conversion to the closest ASCII representation is
# visible here - confirm whether the documented behaviour is still intended.
64 _categories = {}
65 _words = {}
66 dict_regexp_psychometric_words = {}
67
68
69 for id_cat in categories:
70 _categories[id_cat] = unicode(categories[id_cat]) if ascii_only else categories[id_cat]
71
72
73
74 for key in words:
75 plain_word = unicode(key) if ascii_only else key
76
77 if not key.endswith('*'):
78 if _words.has_key(plain_word):  # the normalised word was already stored by an earlier key
79 _words[plain_word] = [w for w in _words[plain_word] if w in words[key]]  # keep only the category ids present in both lists
80 else:
81 _words[plain_word] = words[key]
82 else:
83
84 dict_regexp_psychometric_words[plain_word[:-1]] = words[key]  # wildcard entry: stored without its trailing "*"
85
86 self._words = _words
87 self._categories = _categories
88 self._words['REGEXP'] = dict_regexp_psychometric_words  # wildcard entries live under the 'REGEXP' key of the same dict
89 self._generate_tree()  # rebuild the search tree from the new wildcard entries
90
92 """
93 Reads a psychometric dictionary from a LIWC-formatted file
94 @param filename: Path of the dictionary file. When it is None or does not exist, an empty dictionary is generated.
95 @param encoding: Encoding of the dictionary
96 """
# NOTE(review): the previous documentation said that filename=None selects the path
# from the configuration file; no such lookup is visible in the code below - confirm.
# Layout read below: the text between the first and second '%' holds one
# "<id_category>\t<category_name>" line per category; the text between the second
# and third '%' (or the end of the file) holds one "<word>\t<id_category>..." line per word.
97 if filename==None or not os.path.exists(filename): categories,words = {},{}
98 else:
99 str_psycho_dict = codecs.open(filename,encoding=encoding).read()  # NOTE(review): the file object is never closed explicitly
100 str_categories = str_psycho_dict.split('%')[1].replace('\r','').split('\n')
101 str_words = str_psycho_dict.split('%')[2].replace('\r','').split('\n')
102
103 categories = { int(c[0]):c[1] for c in [l.split("\t") for l in str_categories if l!="" ]}  # empty lines are skipped
104 words = { w[0]:map(int,w[1:]) for w in [l.split("\t") for l in str_words if l!="" ]}  # every field after the word is parsed as an integer category id
105
106 self.generateDictionary( categories, words)
107