'''
Created on 10/04/2013

@author: David Vilares
'''

import os
import codecs
from miopia.adapter.Feature import FeatureTypeConfiguration
from miopia.analyzer.counter.Counter import Counter
from miopia.util.ConfigurationManager import ConfigurationManager
from miopia.adapter.Feature import FeatureType
from collections import defaultdict

class FeatureInfo(object):  # class name assumed; the original declaration is missing from this listing
    """
    This class provides information about a feature for a supervised classifier
    """

    def __init__(self, feature, ranking, information_gain, feature_type):
        """
        @param feature: The identifier of the feature. A string
        @param ranking: The ranking of the feature with respect to the others. A float
        @param information_gain: A float indicating the information gain provided by the feature
        @param feature_type: A constant of L{FeatureType} which represents the type of the feature
        """
        self._feature = feature
        self._ranking = ranking
        self._information_gain = information_gain
        self._feature_type = feature_type

    def get_feature(self):
        """
        @return: A string with the feature
        """
        return self._feature

    def get_ranking(self):
        """
        @return: A float with the ranking of the feature
        """
        return self._ranking

    def get_feature_type(self):
        """
        @return: A constant of L{FeatureType} which represents the type of the feature
        """
        return self._feature_type

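# A small usage sketch with hypothetical values (the class name FeatureInfo is assumed,
# see the note above; FeatureType.WORD stands for any constant of L{FeatureType}):
#
#   info = FeatureInfo("good", 1.0, 0.042, FeatureType.WORD)
#   info.get_feature()       # -> "good"
#   info.get_ranking()       # -> 1.0
#   info.get_feature_type()  # -> FeatureType.WORD
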
class Adapter(object):
    '''
    Adapter is an abstract class which defines the interface for the different
    feature adapters provided in the adapter package.
    '''

    BINARY_WEIGHTING_FACTOR = "BW"
    TOTAL_WEIGHTING_FACTOR = "TW"

    DELIMITER_FEATURE_TYPE_AND_LIST_FEATURES = ":"
    ALLOWED_JAVA_HEAP_XMX = ConfigurationManager().getParameter("allowed_java_heap")

    def __init__(self, path_weka, counter, weight_factor):
        '''
        Constructor
        @param path_weka: The path to the WEKA jar file (weka.jar)
        @param counter: An instance of L{Counter}
        @param weight_factor: A value in {BINARY_WEIGHTING_FACTOR, TOTAL_WEIGHTING_FACTOR}
        '''
        self._counter = counter
        if path_weka is None:
            self._path_weka = ''
        else:
            self._path_weka = path_weka

        self._weighting_factor = weight_factor

    def get_weighting_factor(self):
        return self._weighting_factor

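    # Sketch of a concrete adapter (hypothetical UnigramAdapter; FeatureType.WORD stands
    # for any constant of L{FeatureType}). Adapter is documented as an abstract class, so a
    # concrete adapter is expected to provide at least _get_feature_type() and
    # _get_feature_configuration(), which are referenced by count_features() and
    # _arff_header() below:
    #
    #   class UnigramAdapter(Adapter):
    #
    #       def _get_feature_type(self):
    #           return FeatureType.WORD
    #
    #       def _get_feature_configuration(self, str_feature):
    #           # returns a 3-tuple; only the middle element (a FeatureTypeConfiguration,
    #           # queried via get_weka_data_type()) is used by _arff_header()
    #           ...
    #
    #   adapter = UnigramAdapter(path_weka=None, counter=a_counter,
    #                            weight_factor=Adapter.BINARY_WEIGHTING_FACTOR)
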
    def count_features(self, list_text_info):
        """
        @param list_text_info: A list of L{TextInfo}
        @return: A dictionary with the features detected
        """
        dict_features = self._counter.count(list_text_info)
        dict_long_title_features = {}

        str_feature_type_configuration = str(self._counter.get_feature_type_configuration())
        dict_long_title_features.update({(str(self._get_feature_type()),
                                          str_feature_type_configuration,
                                          feature): dict_features.get(feature)
                                         for feature in dict_features.keys()})

        return dict_long_title_features

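    # The returned dictionary is keyed by "long" feature identifiers, i.e. tuples of the form
    #
    #   (str(feature_type), str(feature_type_configuration), counter_feature_id)
    #
    # with the counts produced by the L{Counter} as values. The third element is the
    # counter-specific identifier that file_id() and name_from_id() decode in the methods below.
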
    def get_classes_from_arff(self, arff_file):  # method name assumed; the original def line is missing
        """
        @param arff_file: The path to an arff file
        @return: The classes considered in that arff
        """
        lines_aux = codecs.open(arff_file, "r").read().split(
            '@DATA')[0].split('\n')
        line_classes = lines_aux[len(lines_aux) - 2]
        return line_classes.split()[2].replace('{', '').replace('}', '').split(',')

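    # For an ARFF file whose header ends with the usual class attribute, e.g.
    #
    #   @ATTRIBUTE 'class' {positive,negative,neutral}
    #   @DATA
    #
    # the method above returns ['positive', 'negative', 'neutral']: it takes the attribute
    # line immediately before @DATA and strips the braces from its third token.
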
    def _long_id_feature_to_string(self, expanded_id):
        """
        @param expanded_id: An identifier used to locate a feature inside a file and a L{SentimentDependencyGraph}
        @todo: Change this in the future
        """
        return expanded_id[0] + expanded_id[1] + self._counter.name_from_id(expanded_id[2])

    def to_arff(self, list_text_info, arff_file, arff_header=None, is_test_set=False):
        """
        @param list_text_info: A list of L{TextInfo}
        @param arff_file: A string. The path to the destination of the data represented in the ARFF format.
        @param arff_header: A string containing an ARFF header, which indicates the features
        that will be taken into account. None if no header is specified.
        @param is_test_set: A boolean. True if it is a test set. False otherwise.
        """
        farff = codecs.open(arff_file, "w")

        dict_features = self._proccess(list_text_info, is_test_set)

        if arff_header is None:
            classes = set([text_info.get_category() for text_info in list_text_info])
            arff_header = self._arff_header(dict_features, classes)

        farff.write(arff_header)
        farff.flush()
        dict_feature_position = self._features_from_arff_header(arff_header)

        arff_data, dict_position_instanceid = self._arff_data(dict_features,
                                                              dict_feature_position, list_text_info)
        farff.write(arff_data)
        farff.flush()
        farff.close()
        return dict_position_instanceid

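    # Typical usage sketch (adapter stands for any concrete Adapter subclass; paths and
    # variable names are illustrative only):
    #
    #   # Training set: the header is built from the data itself.
    #   adapter.to_arff(train_text_infos, "/tmp/train.arff")
    #
    #   # Test set: reuse the training header so both files share the same attribute space.
    #   header = adapter.get_arff_header_from_file("/tmp/train.arff")  # helper below; name assumed
    #   adapter.to_arff(test_text_infos, "/tmp/test.arff", arff_header=header, is_test_set=True)
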
    def get_arff_header_from_file(self, arff_file):  # method name assumed; the original def line is missing
        """
        @param arff_file: The path to an ARFF file
        @return: A string. The ARFF header of the arff_file
        """
        farff = codecs.open(arff_file)
        arff_file_str = farff.read()
        ending_index = arff_file_str.find('@DATA')
        return arff_file_str[0:ending_index + 6]

    def _proccess(self, list_text_info, is_test_set):
        dict_features = self.count_features(list_text_info)
        return dict_features

    def _features_from_arff_header(self, arff_header):
        """
        @param arff_header: A string. An ARFF header
        @return: A dictionary with the features considered in the ARFF header and their
        corresponding position in the file.
        """
        position = 0
        dict_features = {}
        lines_aux = arff_header.split('\n')
        lines_attributes = lines_aux[1:len(lines_aux) - 2]
        for l in lines_attributes:
            l_splitted = l.split()
            feature = l_splitted[1][1:len(l_splitted[1]) - 1].replace("\\'", "'")
            dict_features[feature] = position
            position += 1
        return dict_features

    def _arff_header(self, dict_features, classes):
        """
        @param dict_features: The dictionary of features provided by the count method of L{Counter}.
        @param classes: The classes considered by the classifier.
        @return: A string. The ARFF header.
        """
        list_features = []
        arff_header = "@RELATION Polarity\n"

        for feature in dict_features:
            str_feature = self._long_id_feature_to_string(feature)

            _, ftc, _ = self._get_feature_configuration(str_feature)
            if str_feature not in list_features:
                list_features.append(str_feature)
                arff_header += "@ATTRIBUTE '" + str_feature.encode('utf-8').replace("'", "\\'") + "' " + ftc.get_weka_data_type() + "\n"

        arff_header += "@ATTRIBUTE 'class' {" + ','.join(classes) + "}\n"
        arff_header += "@DATA\n"
        return arff_header

    def get_weighting_value(self, str_ftc, value):
        """
        @param str_ftc: A string. The representation of an instance of L{FeatureTypeConfiguration}
        @param value: An integer.
        @return: A weighted value according to the weighting factor employed by the current adapter
        """
        if self._weighting_factor == self.BINARY_WEIGHTING_FACTOR:
            return 1
        if self._weighting_factor == self.TOTAL_WEIGHTING_FACTOR:
            return value

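    # Example: under TOTAL_WEIGHTING_FACTOR ("TW") a feature seen 3 times in a text
    # contributes 3 to its ARFF cell, whereas under BINARY_WEIGHTING_FACTOR ("BW") it
    # contributes 1 (presence only):
    #
    #   adapter.get_weighting_value(feature_type, 3)   # -> 3 with "TW", 1 with "BW"
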
    def _arff_data(self, dict_features, dict_feature_position, list_text_info):
        """
        @param dict_features: The dictionary of features provided by the count method of L{Counter}.
        @param dict_feature_position: A dictionary mapping each feature to its position in the ARFF header.
        @param list_text_info: A list of L{TextInfo}
        @return: A tuple with the ARFF data section (a string) and a dictionary mapping the
        position of each instance to its text identifier.
        """
        WEKA_RESERVED_WORDS = ['class']
        dict_features_file = defaultdict(defaultdict)
        dict_instanceid_position = {}
        position = 0
        for feature in dict_features:
            feature_id = feature[2]
            feature_type = feature[0]
            str_feature = self._long_id_feature_to_string(feature)
            if str_feature in WEKA_RESERVED_WORDS:
                continue
            file_path = self._counter.file_id(feature_id)
            value = dict_features[feature]

            try:
                dict_features_file[file_path][str_feature] = self.get_weighting_value(feature_type,
                                                                                      dict_features_file[file_path][str_feature] + value)
            except KeyError:
                dict_features_file[file_path][str_feature] = self.get_weighting_value(feature_type,
                                                                                      value)

        arff_data = ""

        for text_info in list_text_info:
            open_symbol = "{"
            close_symbol = "}"

            textid = text_info.get_textid()
            keys = dict_features_file[textid].keys()
            keys = list(set(keys).intersection(set(dict_feature_position.keys())))
            keys.sort(key=lambda x: dict_feature_position[x])
            for feature_in_text in keys:
                arff_data += open_symbol + str(dict_feature_position[feature_in_text]) + " " + str(dict_features_file[textid][feature_in_text]) + ","
                open_symbol = ""
            arff_data += open_symbol + str(dict_feature_position['class']) + " " + text_info.get_category() + close_symbol + "\n"
            dict_instanceid_position[position] = textid
            position += 1

        return arff_data, dict_instanceid_position

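    # The data section is written in sparse ARFF notation, one instance per line, e.g.
    # (hypothetical indexes and values):
    #
    #   {0 2,3 1,57 positive}
    #
    # i.e. "attribute_index value" pairs for the features present in a text, closed by the
    # class attribute index and the category of the corresponding L{TextInfo}. The returned
    # dictionary maps each instance's line position to its text identifier, so classifier
    # predictions can later be traced back to the original texts.
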