1 '''
2 Created on 13/03/2014
3
4 @author: David Vilares
5 '''
6
7 from miopia.analyzer.counter.RawCounter import RawCounter
8 from miopia.util.TokenInfo import TokenInfo
9
11
12 '''
13 Abstract class to manage counters which use external knowledge to obtain
14 features for the supervised classifier.
15 '''
16
def __init__(self, ftc, preprocessor, lexical_processor,
             dict_lexicon, lowercase=True):
    """
    Set up a counter whose features are looked up in an external lexicon.

    @param ftc: An instance of L{FeatureTypeConfiguration}
    @param preprocessor: An instance of L{PreprocessorI}
    @param lexical_processor: An instance of L{LexicalProcessor}
    @param dict_lexicon: A nested dictionary {key:[Category,{key:[ ... ]}]}
    @param lowercase: A boolean. True to ignore capitalised characters.
    """
    # Only ftc and lowercase are forwarded to the parent initialiser; the
    # remaining collaborators are stored on the instance.
    super(LexiconCounter, self).__init__(ftc, lowercase)
    self._dict_lexicon = dict_lexicon
    self._lexical_processor = lexical_processor
    self._preprocessor = preprocessor
31
32
def _count(self, list_text_info):
    """
    Accumulate the lexicon values found in a collection of texts.

    Every text returned by C{raw_processing} is flattened into a single
    list of L{TokenInfo} objects (one per word, over all its sentences).
    That list is then matched against the lexicon with C{_find_values},
    starting at position 1, and the per-text results are summed key by key.

    Texts that yield no tokens are skipped: they cannot contribute any
    feature, and indexing their first token would raise an IndexError.

    @param list_text_info: A list of L{TextInfo} objects
    @return: A dictionary mapping each key returned by C{_find_values}
        to the sum of its values over all the texts
    """
    dict_features = {}
    tags = self.raw_processing(list_text_info)

    # First pass: turn the (word, tag) pairs of each text into TokenInfo
    # objects, dropping the sentence boundaries.
    list_textid_list_token_info = []
    for textid, list_sentences_word_tag in tags:
        l_tokeninfo_text = []
        for sentence_word_tag in list_sentences_word_tag:
            l_tokeninfo_text.extend(
                [TokenInfo(self._filter(word), None,
                           infotag.get_cpostag(), infotag.get_postag())
                 for word, infotag in sentence_word_tag])
        list_textid_list_token_info.append((textid, l_tokeninfo_text))

    # Second pass: look the tokens up in the lexicon and add up the results.
    for textid, list_token_info in list_textid_list_token_info:
        if not list_token_info:
            # Nothing to match in a text without tokens.
            continue
        abstractions = self._find_values(textid, list_token_info[0],
                                         list_token_info[1:], 1, 1,
                                         self._dict_lexicon)
        for abstraction in abstractions:
            if abstraction in dict_features:
                dict_features[abstraction] += abstractions[abstraction]
            else:
                dict_features[abstraction] = abstractions[abstraction]
    return dict_features
58
59
61 return (len(dict_expressions) == 0 or
62 (list_token_info != [] and not dict_expressions.has_key(list_token_info[0].get_form())))
63
def _find_values(self, textid, token_info, sublist_token_info,
                 initial_position, current_position, subdictionary):
    """
    Recursively match a sequence of tokens against the nested lexicon.

    The form of C{token_info} is looked up in C{subdictionary}. On a hit,
    the non-dictionary entries are handed to C{_get_values} (only when
    C{_is_terminal_value} accepts the merged nested dictionaries), and the
    search goes on with the next token inside those nested dictionaries.
    On a miss, the search restarts from the root lexicon at the next token.

    @param textid: The identifier of the text the tokens belong to
    @param token_info: The L{TokenInfo} looked up in this call
    @param sublist_token_info: The list of L{TokenInfo} that follow it
    @param initial_position: The position passed to C{_get_values}; it is
        kept while a match is being extended and reset after a miss
    @param current_position: The position of C{token_info}
    @param subdictionary: The lexicon level where C{token_info} is searched
    @return: A dictionary with the values gathered from this token onwards
    """
    # The try block deliberately spans the whole hit branch, as in the
    # original code: a KeyError escaping from the helpers called below is
    # handled exactly like a failed lookup.
    try:
        entries = subdictionary[token_info.get_form()]
        found = {}

        # Split the entries into plain values and nested dictionaries.
        # The type test is exact, so dict subclasses count as plain values.
        plain_entries = []
        nested_entries = {}
        for entry in entries:
            if type(entry) is dict:
                nested_entries.update(entry)
            else:
                plain_entries.append(entry)

        if self._is_terminal_value(nested_entries, sublist_token_info):
            for entry in plain_entries:
                found = self._get_values(found, textid,
                                         initial_position, entry)

        if sublist_token_info != []:
            # Keep extending the current match one level deeper.
            deeper = self._find_values(textid, sublist_token_info[0],
                                       sublist_token_info[1:],
                                       initial_position,
                                       current_position + 1,
                                       nested_entries)
            for key in deeper:
                if key in found:
                    found[key] += deeper[key]
                else:
                    found[key] = deeper[key]
        return found

    except KeyError:
        if sublist_token_info == []:
            return {}
        # NOTE(review): the token that missed is not retried against the
        # root lexicon; the search resumes at the following token. Confirm
        # that this is the intended behaviour.
        return self._find_values(textid, sublist_token_info[0],
                                 sublist_token_info[1:],
                                 current_position + 1,
                                 current_position + 1,
                                 self._dict_lexicon)
99