1
2 '''
3 @author: David Vilares Calvo
4 '''
5 from collections import defaultdict
6 from miopia.util.ConfigurationManager import ConfigurationManager
7 from miopia.util.exceptions.UnknownSOException import *
8 from miopia.analyzer.SemanticCategory import SemanticCategory
9 from miopia.analyzer.PsychometricDictionary import PsychometricDictionary
10 import codecs
11 import os
12 import math
13 import warnings
14
16 '''
17 A class for obtaining the semantic resources provided with MIOPIA
18 '''
19 __uniqueInstance = None
20
def __init__(self, noun=None, adj=None, adv=None, verb=None, intd=None,
             lemmas=None,
             p_dict=None):
    '''
    Build the dictionary either from the resources passed as arguments or,
    when none are given, from the files named in the MIOPIA configuration
    (deprecated path).

    @param noun: nouns
    @param adj: adjectives
    @param adv: adverbs
    @param verb: verbs
    @param intd: intensifiers
    @param lemmas: lemmas
    @param p_dict: A L{PsychometricDictionary}
    '''
    # None stands for "empty dictionary". A literal {} default would be one
    # object shared by every instance that does not pass that argument.
    noun = {} if noun is None else noun
    adj = {} if adj is None else adj
    adv = {} if adv is None else adv
    verb = {} if verb is None else verb
    intd = {} if intd is None else intd
    lemmas = {} if lemmas is None else lemmas

    nothing_given = (noun == adj == adv == verb == intd == lemmas == {}
                     and p_dict is None)

    if nothing_given:
        warnings.warn("deprecated: dictionaries should be passed as arguments",DeprecationWarning)

        # Legacy behaviour: load every resource from the configured paths.
        encoding_dict = "ISO-8859-15"
        path = ConfigurationManager(lang='es').getParameter("path_SODictionaries")
        self._noun_dict = self._get_words(path+"/ciao+SD_noun_dict_spa.txt",encoding_dict)
        self._adj_dict = self._get_words(path+"/ciao+SD_adj_dict_spa.txt",encoding_dict)
        self._adv_dict = self._get_words(path+"/ciao+SD_adv_dict_spa.txt",encoding_dict)
        self._verb_dict = self._get_words(path+"/ciao+SD_verb_dict_spa.txt",encoding_dict)
        self._int_dict = self._get_words(path+"/int_dict_spa.txt",encoding_dict)
        path = ConfigurationManager().getParameter("path_lemmas")
        self._lemmas = self._build_lemmas_dictionary(path)

        p_dict = PsychometricDictionary()
        p_dict.readFromFile(ConfigurationManager().getParameter("path_lwic_dictionary"))
        self._psychometric_dict = p_dict
    else:
        self._noun_dict = noun
        self._adj_dict = adj
        self._adv_dict = adv
        self._verb_dict = verb
        self._int_dict = intd
        self._lemmas = lemmas
        # Always define the attribute (None when no psychometric dictionary
        # was supplied) so later reads cannot fail with AttributeError.
        self._psychometric_dict = p_dict

    # No domain adaptation until adapt() is called.
    self._domain_dict = None
    self._neg_pond = 0
63
def _get_words(self, name_of_dict, coding):
    """
    Read a semantic orientation dictionary from a text file.

    Each line is split on whitespace; only lines with exactly two columns
    (word, value) are used. When a word appears more than once, the first
    value read is kept.

    @param name_of_dict: The path to a semantic orientation dictionary
    @param coding: The encoding used to decode the file
    @return: A dictionary {word,semantic orientation value}
    @raise ValueError: If the second column of a two-column line is not a number
    @deprecated: dictionaries should be passed as arguments to the constructor
    """
    # The context manager closes the file even if a value fails to parse.
    with codecs.open(name_of_dict, encoding=coding) as dict_file:
        lines = dict_file.readlines()

    D = {}
    for line in lines:
        columns = line.split()
        # str.split() never yields empty strings, so no emptiness check is
        # needed; 'in' replaces dict.has_key(), which Python 3 removed.
        if len(columns) == 2 and columns[0] not in D:
            D[columns[0]] = float(columns[1])
    return D
78
def _build_lemmas_dictionary(self, lemmas_file):
    """
    Build a nested dictionary D[lexical_category][token] to get lemmas

    @param lemmas_file: A path to the lemmas dict file (read as UTF-8)
    @precondition: lemmas dict must have this format: CrossTag\\tToken\\tLemma
    @return: A nested dictionary with the lemmas
    """
    # The context manager guarantees the file handle is released.
    with codecs.open(lemmas_file, encoding="utf-8") as lemmas_handle:
        lines = lemmas_handle.readlines()

    # The inner defaultdict has no default factory, so looking up an unknown
    # token still raises KeyError.
    D = defaultdict(defaultdict)
    for line in lines:
        columns = line.split('\t')
        if len(columns) == 3:
            # Strip only the line terminator. Dropping the last character
            # unconditionally would truncate the lemma on a final line with
            # no newline and would leave '\r' behind on CRLF files.
            D[columns[0]][columns[1]] = columns[2].rstrip('\r\n')
    return D
95
def adapt(self, dict_domain_info, p, n, neg_pond=0, threshold=0.5, minimum_ocurrences=1):
    """
    Build the domain dictionary (self._domain_dict) from domain statistics.

    Every attribute gets a semantic orientation in [-5, 5]: the log2 ratio of
    its smoothed positive/negative occurrence counts, scaled by the largest
    such ratio seen in dict_domain_info. Attributes are then taken in
    ascending order of their info tuple until the requested proportions of
    positive and negative attributes have been included.

    @param dict_domain_info: A dictionary returned by the L{DomainAdaptor}.
    The attribute form is the key. The value is a tuple
    (avg_position_in_raking, ocurrences_in_pos_files, ocurrences_in_negatives_files).
    @param p: A number between 0 and 1. The proportion of positive words included in the dictionary.
    @param n: A number between 0 and 1. The proportion of negative words included in the dictionary.
    @param neg_pond: Weighting for negative words. Stored in self._neg_pond.
    @param threshold: Minimum absolute orientation an attribute needs to be considered.
    @param minimum_ocurrences: Minimum number of occurrences (positive plus negative)
    an attribute needs to be considered.
    """
    self._neg_pond = neg_pond

    new_end = 5.0   # orientations are scaled into [-new_end, new_end]
    relax = 1.0     # add-one smoothing, avoids division by zero and log(0)

    def log_ratio(numerator, denominator):
        # log2 of the smoothed ratio between two occurrence counts.
        return math.log(float(numerator + relax) / float(denominator + relax), 2)

    # Largest positive-leaning and negative-leaning ratios in the data, used
    # as normalisation constants.
    pos_dif = 0.0
    neg_dif = 0.0
    for info in dict_domain_info.values():
        pos_dif = max(pos_dif, log_ratio(info[1], info[2]))
        neg_dif = max(neg_dif, log_ratio(info[2], info[1]))

    def scaled_orientation(atr_pos, atr_neg):
        if atr_pos > atr_neg:
            return (log_ratio(atr_pos, atr_neg) / pos_dif) * new_end
        if atr_neg > atr_pos:
            return (log_ratio(atr_neg, atr_pos) / neg_dif) * -new_end
        # Equal counts carry no polarity. Returning 0.0 directly avoids
        # dividing 0.0 by neg_dif, which is 0.0 (ZeroDivisionError) when no
        # attribute in the data is negative-dominant.
        return 0.0

    total_pos = 0
    total_neg = 0
    dict_clear_polarity_attributes = {}
    for atr, info in dict_domain_info.items():
        so = scaled_orientation(info[1], info[2])
        if abs(so) >= threshold and info[1] + info[2] >= minimum_ocurrences:
            if so < 0:
                total_neg += 1
            elif so > 0:
                total_pos += 1
            dict_clear_polarity_attributes[atr] = so

    # Rank candidates by their info tuple (average ranking position first).
    list_clear_polarity_attributes = sorted(dict_clear_polarity_attributes,
                                            key=dict_domain_info.get)
    p_atr, n_atr = total_pos * p, total_neg * n
    p_included, n_included = 0, 0
    self._domain_dict = {}
    for atr in list_clear_polarity_attributes:
        so = dict_clear_polarity_attributes[atr]
        if so > 0 and p_included < p_atr:
            self._domain_dict[atr] = so
            p_included += 1
        if so < 0 and n_included < n_atr:
            self._domain_dict[atr] = so
            n_included += 1

        # Both quotas are filled: nothing else can be added.
        if p_included >= p_atr and n_included >= n_atr:
            break
177
178
179
180
182 return lemma in self._int_dict.keys()
183
184
186 """
@param semantic_category: A value in the collection {'n','a','v','r','i'}: 'n' is a noun, 'a' is an adjective, 'v' is a verb, 'r' is an adverb and 'i' is an intensifier.
@raise UnknownSOException: Raised if the term has no semantic orientation
@return: The semantic orientation of the word
190 """
191
192 so = 0
193
194 if semantic_category == SemanticCategory.ADJECTIVE and self._adj_dict.has_key(lemma):
195 so = self._adj_dict.get(lemma)
196 if semantic_category == SemanticCategory.NOUN and self._noun_dict.has_key(lemma):
197 so = self._noun_dict.get(lemma)
198 if semantic_category == SemanticCategory.VERB and self._verb_dict.has_key(lemma):
199 so = self._verb_dict.get(lemma)
200 if semantic_category == SemanticCategory.ADVERB and self._adv_dict.has_key(lemma):
201 so = self._adv_dict.get(lemma)
202 if semantic_category == SemanticCategory.INTENSIFIER and self._int_dict.has_key(lemma):
203 so = self._int_dict.get(lemma)
204
205 if (self._domain_dict is not None and
206 self._domain_dict.has_key(lemma) and
207 self._domain_dict.get(lemma) and
208 semantic_category in [SemanticCategory.NOUN, SemanticCategory.ADJECTIVE,
209 SemanticCategory.VERB, SemanticCategory.ADVERB]):
210 so = self._domain_dict.get(lemma)
211
212
213 if so == 0:
214 raise UnknownSOException("Term hasn't semantic orientation")
215 else:
216 return so*(1 + self._neg_pond*(so<0))
217
218
219
222
224 """
225 @param token: A token
226 @param lexical_category: The lexical category of the token
227 @return: The lemma of the token
228 """
229 try:
230 return self._lemmas[lexical_category][token]
231 except:
232 try:
233 return self._lemmas[lexical_category][token.lower()]
234 except:
235 try:
236 return self._heuristic_lemma(lexical_category, token)
237 except:
238
239
240 return token
241
def _heuristic_lemma(self, lexical_category, word):
    """
    Guess the lemma of a token by undoing common suffixes (plural and
    feminine endings for adjectives, enclitic pronouns for verbs,
    diminutives for nouns) and looking the candidate up in the lemmas
    dictionary.

    @param lexical_category: The lexical category of the word
    @param word: A token
    @raise KeyError: If the processed token is not in the lemmas dictionary
    @return: A possible lemma of the token
    """
    lemmas = self._lemmas[lexical_category]

    if lexical_category == 'a':
        if word.endswith("s"):
            # Plural: drop the final 's'.
            return lemmas[word[:-1]]

        if word.endswith("a"):
            try:
                # Feminine: try the masculine form.
                return lemmas[word[:-1] + "o"]
            except KeyError:
                if word.endswith("ita"):
                    # Diminutive: '-ita' -> '-a'.
                    return lemmas[word[:-3] + "a"]
                # Documented contract: an unknown token raises KeyError.
                raise KeyError(word)

    if lexical_category == 'v':
        # Enclitic pronouns attached to the verb form.
        if word.endswith(("se", "le", "me", "lo", "la")):
            return lemmas[word[:-2]]
        if word.endswith(("nos", "les")):
            return lemmas[word[:-3]]

    if lexical_category == 'n':
        # Feminine diminutives.
        if word.endswith("ita"):
            if word.endswith("quita"):
                return lemmas[word[:-5] + "ca"]
            elif word.endswith("ecita"):
                return lemmas[word[:-5] + "a"]
            else:
                return lemmas[word[:-3] + "a"]

        # Masculine diminutives.
        if word.endswith("ito"):
            if word.endswith("quito"):
                return lemmas[word[:-5] + "co"]
            elif word.endswith("cito"):
                return lemmas[word[:-4]]
            else:
                return lemmas[word[:-3] + "o"]

    # No heuristic applied: look the token up unchanged.
    return lemmas[word]
287