
Source Code for Module miopia.analyzer.counter.LexiconCounter

'''
Created on 13/03/2014

@author: David Vilares
'''

from miopia.analyzer.counter.RawCounter import RawCounter
from miopia.util.TokenInfo import TokenInfo
class LexiconCounter(RawCounter):
    '''
    Abstract class to manage counters which use external knowledge to obtain
    features for the supervised classifier.
    '''
    def __init__(self, ftc, preprocessor, lexical_processor,
                 dict_lexicon, lowercase=True):
        '''
        @param ftc: An instance of L{FeatureTypeConfiguration}
        @param preprocessor: An instance of L{PreprocessorI}
        @param lexical_processor: An instance of L{LexicalProcessor}
        @param dict_lexicon: A nested dictionary {key: [Category, {key: [...]}]}
        @param lowercase: A boolean. True to ignore capitalised characters.
        '''
        super(LexiconCounter, self).__init__(ftc, lowercase)
        self._preprocessor = preprocessor
        self._lexical_processor = lexical_processor
        self._dict_lexicon = dict_lexicon
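    # Illustrative sketch of the expected dict_lexicon shape (the words and
    # category labels below are hypothetical, not from the original source):
    #
    #     {'bad': ['negative', {}],
    #      'not': [{'bad': ['positive', {}]}]}
    #
    # 'bad' alone yields the category 'negative', while the nested dictionary
    # under 'not' lets the longer expression 'not bad' yield 'positive';
    # _find_values prefers the longest match it can complete.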
    def _count(self, list_text_info):
        """
        @param list_text_info: A list of L{TextInfo} objects
        """
        dict_features = {}
        tags = self.raw_processing(list_text_info)

        # Flatten each tagged text into a single list of TokenInfo objects.
        list_textid_list_token_info = []
        for textid, list_sentences_word_tag in tags:
            l_tokeninfo_text = []
            for sentence_word_tag in list_sentences_word_tag:
                l_tokeninfo_text.extend(
                    [TokenInfo(self._filter(word), None,
                               infotag.get_cpostag(), infotag.get_postag())
                     for word, infotag in sentence_word_tag])
            list_textid_list_token_info.append((textid, l_tokeninfo_text))

        # Look up every text in the lexicon and aggregate the matches.
        for textid, list_token_info in list_textid_list_token_info:
            if not list_token_info:  # skip empty texts
                continue
            abstractions = self._find_values(textid, list_token_info[0],
                                             list_token_info[1:], 1, 1,
                                             self._dict_lexicon)
            for abstraction in abstractions:
                try:
                    dict_features[abstraction] += abstractions[abstraction]
                except KeyError:
                    dict_features[abstraction] = abstractions[abstraction]
        return dict_features
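    # Illustrative note (hypothetical figures, assuming _get_values keys
    # features by lexicon category): if the input texts produced three
    # 'positive' matches and one 'negative' match overall, _count would
    # return
    #     {'positive': 3, 'negative': 1}
    # i.e. one aggregated count per lexicon category across all texts.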
    def _is_terminal_value(self, dict_expressions, list_token_info):
        # A match is terminal when the lexicon offers no continuation, or the
        # next token cannot extend the current expression.
        return (len(dict_expressions) == 0 or
                (list_token_info != [] and
                 list_token_info[0].get_form() not in dict_expressions))
    def _find_values(self, textid, token_info, sublist_token_info,
                     initial_position, current_position, subdictionary):
        try:
            values = subdictionary[token_info.get_form()]
            d = {}
            # Plain values are categories; dict values are continuations of a
            # (possibly multiword) expression.
            values_non_dict = [value for value in values
                               if not isinstance(value, dict)]
            values_dict = {}
            for value in values:
                if isinstance(value, dict):
                    values_dict.update(value)

            if self._is_terminal_value(values_dict, sublist_token_info):
                # It's the longest match
                for value in values_non_dict:
                    d = self._get_values(d, textid, initial_position, value)
            if sublist_token_info != []:
                # Take the next token
                sub_d = self._find_values(textid, sublist_token_info[0],
                                          sublist_token_info[1:],
                                          initial_position,
                                          current_position + 1, values_dict)
                for key in sub_d:
                    try:
                        d[key] += sub_d[key]
                    except KeyError:
                        d[key] = sub_d[key]
            return d

        except KeyError:
            # No match: move on to the next token and look up its
            # abstractions from the top of the lexicon.
            if sublist_token_info == []:
                return {}
            else:
                return self._find_values(textid, sublist_token_info[0],
                                         sublist_token_info[1:],
                                         current_position + 1,
                                         current_position + 1,
                                         self._dict_lexicon)
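
The longest-match lookup above is easiest to see in isolation. Below is a minimal, self-contained sketch (not part of the original module) of the same traversal, using plain strings instead of L{TokenInfo} objects and simple counting in place of the _get_values bookkeeping; the lexicon, words and category labels are made up for illustration.

# Standalone sketch of the longest-match traversal (hypothetical example).

def find_values(tokens, lexicon, full_lexicon=None):
    """Return {category: count} for the longest lexicon matches in tokens."""
    if full_lexicon is None:
        full_lexicon = lexicon
    if not tokens:
        return {}
    head, tail = tokens[0], tokens[1:]
    try:
        values = lexicon[head]
    except KeyError:
        # No match: restart the search at the next token.
        return find_values(tail, full_lexicon, full_lexicon)
    # Plain values are categories; dict values continue a longer expression.
    categories = [v for v in values if not isinstance(v, dict)]
    continuations = {}
    for v in values:
        if isinstance(v, dict):
            continuations.update(v)
    d = {}
    # Emit categories only when the match cannot be extended
    # (cf. _is_terminal_value above).
    if not continuations or (tail and tail[0] not in continuations):
        for c in categories:
            d[c] = d.get(c, 0) + 1
    if tail:
        for key, count in find_values(tail, continuations, full_lexicon).items():
            d[key] = d.get(key, 0) + count
    return d

if __name__ == '__main__':
    lexicon = {'bad': ['negative', {}],
               'not': [{'bad': ['positive', {}]}]}
    print(find_values('this is not bad'.split(), lexicon))  # {'positive': 1}
    print(find_values('a bad day'.split(), lexicon))        # {'negative': 1}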