Package miopia :: Package analyzer :: Module Dictionary
[hide private]
[frames] | [no frames]

Source Code for Module miopia.analyzer.Dictionary

  1  #-*- coding: utf-8 -*- 
  2  ''' 
  3  @author: David Vilares Calvo 
  4  ''' 
  5  from collections import defaultdict 
  6  from miopia.util.ConfigurationManager import ConfigurationManager 
  7  from miopia.util.exceptions.UnknownSOException import * 
  8  from miopia.analyzer.SemanticCategory import SemanticCategory 
  9  from miopia.analyzer.PsychometricDictionary import PsychometricDictionary 
 10  import codecs 
 11  import os 
 12  import math 
 13  import warnings  
 14   
class Dictionary(object):
    '''
    A class for obtaining the semantic resources provided with MIOPIA:
    per-category semantic-orientation dictionaries (noun/adjective/adverb/verb),
    an intensifier dictionary, a lemmas dictionary and a psychometric dictionary.
    '''
    # Reserved for a singleton instance; not referenced in the code visible here.
    __uniqueInstance = None
21 - def __init__(self, noun={}, adj={}, adv={}, verb={}, intd={}, 22 lemmas={}, 23 p_dict=None):
24 ''' 25 @param noun: nouns 26 @param adj: adjectives 27 @param adv: adverbs 28 @param verb: verbs 29 @param int: intensifiers 30 @param lemmas: lemmas 31 @param p_dict: A L{PsychometricDictionary} 32 ''' 33 34 if(noun == adj == adv == verb == intd == lemmas == {} and p_dict == None): 35 warnings.warn("deprecated: dictionaries should be passed as arguments",DeprecationWarning) 36 37 encoding_dict = "ISO-8859-15" 38 path = ConfigurationManager(lang='es').getParameter("path_SODictionaries") 39 self._noun_dict = self._get_words(path+"/ciao+SD_noun_dict_spa.txt",encoding_dict) 40 self._adj_dict = self._get_words(path+"/ciao+SD_adj_dict_spa.txt",encoding_dict) 41 self._adv_dict = self._get_words(path+"/ciao+SD_adv_dict_spa.txt",encoding_dict) 42 self._verb_dict = self._get_words(path+"/ciao+SD_verb_dict_spa.txt",encoding_dict) 43 self._int_dict = self._get_words(path+"/int_dict_spa.txt",encoding_dict) 44 path = ConfigurationManager().getParameter("path_lemmas") 45 self._lemmas = self._build_lemmas_dictionary(path) 46 47 #The psychometric dictionary 48 p_dict = PsychometricDictionary() 49 p_dict.readFromFile(ConfigurationManager().getParameter("path_lwic_dictionary")) 50 self._psychometric_dict = p_dict 51 else: 52 self._noun_dict = noun 53 self._adj_dict = adj 54 self._adv_dict = adv 55 self._verb_dict = verb 56 self._int_dict = intd 57 self._lemmas = lemmas #self._build_lemmas_dictionary(path) 58 if(p_dict != None): 59 self._psychometric_dict = p_dict 60 61 self._domain_dict = None 62 self._neg_pond = 0
63
64 - def _get_words(self,name_of_dict,coding="utf-8"):
65 """ 66 @param name_of_dict: The path to a semantic orientation dictionary 67 @return: A dictionary {word,semantic orientation value} 68 @deprecated: dictionaries should be passed as arguments to the constructor 69 """ 70 words = codecs.open(name_of_dict,encoding=coding).readlines() 71 D = {} 72 for word in words: 73 columns = word.split() 74 if len(columns) == 2: 75 if not D.has_key(columns[0]) and columns[0] != '': 76 D[columns[0]] = float(columns[1]) 77 return D
78
79 - def _build_lemmas_dictionary(self,lemmas_file):
80 """ 81 Build a nested dictionary D[lexical_category][token] to get lemmas 82 @param lemmas_file: A path to the lemmas dict file 83 @precondition: lemmas dict must has this format: CrossTag\tToken\tLemma 84 @return: A nested dictionary with the lemmas 85 """ 86 87 words = codecs.open(lemmas_file,encoding="utf-8").readlines() 88 89 D = defaultdict(defaultdict) 90 for word in words: 91 columns = word.split('\t') 92 if len(columns) == 3: 93 D[columns[0]][columns[1]] = columns[2][0:len(columns[2])-1] 94 return D
95
96 - def adapt(self,dict_domain_info,p,n,neg_pond=0,threshold=0.5,minimum_ocurrences=1):
97 """ 98 @param dict_domain_info: A dictionary returned by the L{DomainAdaptor}. 99 The attribute form is the key. The value is a tuple 100 (avg_position_in_raking, ocurrences_in_pos_files, ocurrences_in_negatives_files). 101 @param p: An integer between 0 and 1. The proportion of positive words included in the dictionary. 102 @param n: An integer between 0 and 1. The proportion of negative words included in the dictionary. 103 @param neg_pond: Weighting for negative words. 104 @param threshold: 105 @param minimum_ocurrences: 106 """ 107 108 # def m2(atr_pos,atr_neg,max_pos_dif, min_neg_dif, neg_ponderation): 109 # new_end = 5.0 110 # relax = 1.0 111 # 112 # if atr_pos > atr_neg: 113 # return (( math.log( (atr_pos + relax) / (atr_neg + relax) ,2)) 114 # / max_pos_dif )*new_end 115 # else: 116 # return (( math.log( (atr_neg + relax) / (atr_pos + relax) ,2)) 117 # / min_neg_dif)*-(new_end*neg_ponderation) 118 119 self._neg_pond = neg_pond 120 def m2(atr_pos,atr_neg,max_pos_dif, min_neg_dif, neg_ponderation): 121 new_end = 5.0 122 relax = 1.0 123 124 if atr_pos > atr_neg: 125 return (( math.log( (atr_pos + relax) / (atr_neg + relax) ,2)) 126 / max_pos_dif )*new_end 127 else: 128 return (( math.log( (atr_neg + relax) / (atr_pos + relax) ,2)) 129 / min_neg_dif)*-(new_end)
130 131 132 133 def get_max_diffs_log2(dict_atr): 134 pos_max_value = 0.0 135 neg_max_value = 0.0 136 relax = 1.0 137 138 for atr in dict_atr.keys(): 139 pos_max_value = max(pos_max_value, math.log(float(dict_atr[atr][1] +relax) / 140 float(dict_atr[atr][2] + relax),2)) 141 neg_max_value = max(neg_max_value, math.log(float(dict_atr[atr][2] +relax) / 142 float(dict_atr[atr][1] + relax),2)) 143 return (pos_max_value,neg_max_value)
144 145 146 total_pos = 0 147 total_neg = 0 148 dict_clear_polarity_attributes = {} 149 (pos_dif,neg_dif) = get_max_diffs_log2(dict_domain_info) 150 for atr in dict_domain_info.keys(): 151 os = m2(dict_domain_info[atr][1],dict_domain_info[atr][2], 152 pos_dif,neg_dif,neg_pond) 153 if (abs(os) >= threshold and 154 dict_domain_info[atr][1]+dict_domain_info[atr][2] >= minimum_ocurrences): 155 if os < 0 : 156 total_neg += 1 157 if os > 0: 158 total_pos +=1 159 dict_clear_polarity_attributes[atr] = os 160 161 list_clear_polarity_attributes = sorted(dict_clear_polarity_attributes, 162 key = dict_domain_info.get) 163 p_atr,n_atr = total_pos*p, total_neg*n 164 p_included,n_included = 0,0 165 self._domain_dict ={} 166 for atr in list_clear_polarity_attributes: 167 os = dict_clear_polarity_attributes[atr] 168 if os > 0 and p_included < p_atr: 169 self._domain_dict[atr] = os 170 p_included+=1 171 if os < 0 and n_included < n_atr: 172 self._domain_dict[atr] = os 173 n_included+=1 174 175 if p_included >= p_atr and n_included >= n_atr: 176 break 177 # fwrite = open("/tmp/domain_dictSEEDICT.txt","w") 178 # for key in self._domain_dict.keys(): 179 # fwrite.write(key+"\t"+str(self._domain_dict[key])+"\n") 180
181 - def is_intensifier_term(self,lemma):
182 return lemma in self._int_dict.keys()
183 184
185 - def get_semantic_orientation(self,lemma,semantic_category):
186 """ 187 @param semantic_category: A value in the collection {'n','a','v','r','i'}, 'n' is a noun, 'a' is an adjetive, 'v' is a verb, 'r' is an adverb and 'i' is an intensifier. 188 @raise UnknownSOException: Raises this exception if term hasn't semantic orientation 189 @return: Semantic orientation of a word 190 """ 191 192 so = 0 #Semantic orientation of a word 193 194 if semantic_category == SemanticCategory.ADJECTIVE and self._adj_dict.has_key(lemma): 195 so = self._adj_dict.get(lemma) 196 if semantic_category == SemanticCategory.NOUN and self._noun_dict.has_key(lemma): 197 so = self._noun_dict.get(lemma) 198 if semantic_category == SemanticCategory.VERB and self._verb_dict.has_key(lemma): 199 so = self._verb_dict.get(lemma) 200 if semantic_category == SemanticCategory.ADVERB and self._adv_dict.has_key(lemma): 201 so = self._adv_dict.get(lemma) 202 if semantic_category == SemanticCategory.INTENSIFIER and self._int_dict.has_key(lemma): 203 so = self._int_dict.get(lemma) 204 205 if (self._domain_dict is not None and 206 self._domain_dict.has_key(lemma) and 207 self._domain_dict.get(lemma) and 208 semantic_category in [SemanticCategory.NOUN, SemanticCategory.ADJECTIVE, 209 SemanticCategory.VERB, SemanticCategory.ADVERB]): 210 so = self._domain_dict.get(lemma) 211 212 213 if so == 0: 214 raise UnknownSOException("Term hasn't semantic orientation") 215 else: 216 return so*(1 + self._neg_pond*(so<0))
217 218 219
220 - def get_psychometric_categories(self,str_word):
221 return self._psychometric_dict.get_psychometric_categories(str_word)
222
223 - def get_lemma(self,lexical_category,token):
224 """ 225 @param token: A token 226 @param lexical_category: The lexical category of the token 227 @return: The lemma of the token 228 """ 229 try: 230 return self._lemmas[lexical_category][token] 231 except: 232 try: 233 return self._lemmas[lexical_category][token.lower()] 234 except: 235 try: 236 return self._heuristic_lemma(lexical_category, token) 237 except: 238 #XXX: In SFU None was returned 239 #return None 240 return token
241
242 - def _heuristic_lemma(self,lexical_category, word):
243 """ 244 @param lexical_category: The lexical category of the word 245 @param word: A token 246 @raise KeyError: If processed token isn't at lemmas dictionary 247 @return: A possible lemma of the token 248 """ 249 if lexical_category in ['a']: 250 if word.endswith("s"): 251 return self._lemmas[lexical_category][word[0:len(word)-1]] 252 253 if word.endswith("a"): 254 try: 255 return self._lemmas[lexical_category][word[0:len(word)-1]+"o"] 256 except: 257 if word.endswith("ita"): 258 return self._lemmas[lexical_category][word[0:len(word)-3]+"a"] 259 else: 260 raise Exception 261 262 if lexical_category == 'v': 263 if word.endswith("se") or word.endswith("le") or word.endswith("me") or word.endswith("lo") or word.endswith("la"): 264 return self._lemmas[lexical_category][word[0:len(word)-2]] 265 if word.endswith("nos") or word.endswith("les"): 266 return self._lemmas[lexical_category][word[0:len(word)-3]] 267 268 if lexical_category == 'n': 269 if word.endswith("ita"): 270 271 if word.endswith("quita"): 272 return self._lemmas[lexical_category][word[0:len(word)-5]+"ca"] 273 elif word.endswith("ecita"): 274 return self._lemmas[lexical_category][word[0:len(word)-5]+"a"] 275 else: 276 return self._lemmas[lexical_category][word[0:len(word)-3]+"a"] 277 278 if word.endswith("ito"): 279 if word.endswith("quito"): 280 return self._lemmas[lexical_category][word[0:len(word)-5]+"co"] 281 elif word.endswith("cito"): 282 return self._lemmas[lexical_category][word[0:len(word)-4]] 283 else: 284 return self._lemmas[lexical_category][word[0:len(word)-3]+"o"] 285 286 return self._lemmas[lexical_category][word]
287