Package miopia :: Package adapter :: Module Adapter
[hide private]
[frames] | [no frames]

Source Code for Module miopia.adapter.Adapter

  1  ''' 
  2  Created on 10/04/2013 
  3   
  4  @author: David Vilares 
  5  ''' 
  6   
  7  import os 
  8  import codecs 
  9  from miopia.adapter.Feature import FeatureTypeConfiguration 
 10  from miopia.analyzer.counter.Counter import Counter 
 11  from miopia.util.ConfigurationManager import ConfigurationManager 
 12  from miopia.adapter.Feature import FeatureType 
 13  from collections import defaultdict 
 14   
class FeatureInfo(object):
    """
    This class provides information about a feature for a supervised classifier.
    """

    def __init__(self, feature, ranking, information_gain, feature_type):
        """
        @param feature: The identifier of the feature. A string
        @param ranking: The ranking of the feature with respect to the others. A float
        @param information_gain: A float indicating the information gain provided by the feature
        @param feature_type: A constant of L{FeatureType} which represents the type of the feature
        """
        self._feature = feature
        self._ranking = ranking
        self._information_gain = information_gain
        # BUG FIX: previously assigned the builtin `type` instead of the
        # `feature_type` constructor parameter.
        self._feature_type = feature_type

    def get_feature(self):
        """
        @return: A string with the feature
        """
        return self._feature

    def get_ranking(self):
        """
        @return: A float with the ranking of the feature
        """
        return self._ranking

    def get_feature_type(self):
        """
        @return: A constant of L{FeatureType} which represents the type of the feature
        """
        return self._feature_type

    def get_information_gain(self):
        """
        @return: A float with the information gain provided by the feature
        """
        # BUG FIX: previously returned self.get_information_gain(), which
        # recursed forever; return the stored attribute instead.
        return self._information_gain
class Adapter(object):
    # TODO: Any adapter can read any adapter ranking file, is this good?
    '''
    Adapter is an abstract class which defines the interface for different features Adapter's
    provided in the package adapter
    '''

    BINARY_WEIGHTING_FACTOR = "BW"
    TOTAL_WEIGHTING_FACTOR = "TW"

    DELIMITER_FEATURE_TYPE_AND_LIST_FEATURES = ":"
    ALLOWED_JAVA_HEAP_XMX = ConfigurationManager().getParameter("allowed_java_heap")

    def __init__(self, path_weka, counter,
                 weight_factor=BINARY_WEIGHTING_FACTOR):
        '''
        Constructor
        @param path_weka: The path to the WEKA.jar
        @param counter: An instance of L{Counter}
        @param weight_factor: A value {BINARY_WEIGHTING_FACTOR, TOTAL_WEIGHTING_FACTOR}
        '''
        self._counter = counter
        # Normalize a missing WEKA path to the empty string so later path
        # concatenations do not fail on None.
        if path_weka is None:
            self._path_weka = ''
        else:
            self._path_weka = path_weka
        self._weighting_factor = weight_factor

    def get_counter(self):
        """
        @return: The L{Counter} instance used by this adapter
        """
        return self._counter

    def get_weighting_factor(self):
        """
        @return: The weighting factor constant (BINARY_WEIGHTING_FACTOR or
                 TOTAL_WEIGHTING_FACTOR)
        """
        return self._weighting_factor

    def count_features(self, list_text_info):
        """
        @param list_text_info: A list of L{TextInfo}
        @return: A dictionary mapping (feature_type, feature_type_configuration,
                 feature) tuples to the counts produced by the L{Counter}.
        """
        dict_features = self._counter.count(list_text_info)
        dict_long_title_features = {}

        # Expand every short feature key into a long, self-describing key so
        # features from different adapters cannot collide.
        str_feature_type_configuration = str(self._counter.get_feature_type_configuration())
        dict_long_title_features.update({(str(self._get_feature_type()),
                                          str_feature_type_configuration,
                                          feature): dict_features.get(feature)
                                         for feature in dict_features.keys()})

        return dict_long_title_features

    def classes_from_arff(self, arff_file):
        """
        @param arff_file: The path to an arff file
        @return: The classes considered in that arff
        """
        # BUG FIX: close the file handle once read (it was leaked before).
        farff = codecs.open(arff_file, "r")
        try:
            lines_aux = farff.read().split('@DATA')[0].split('\n')
        finally:
            farff.close()
        # The class attribute is the second-to-last line of the header,
        # e.g. "@ATTRIBUTE 'class' {pos,neg}".
        line_classes = lines_aux[len(lines_aux) - 2]
        return line_classes.split()[2].replace('{', '').replace('}', '').split(',')

    def _long_id_feature_to_string(self, expanded_id):
        # TODO: Change Counter()
        """
        @param expanded_id: An identifier to know the location of a feature inside
                            a file and a L{SentimentDependencyGraph}
        @todo: Change this in the future
        """
        return expanded_id[0] + expanded_id[1] + self._counter.name_from_id(expanded_id[2])

    def to_arff(self, list_text_info, arff_file, arff_header=None, is_test_set=False):
        """
        @param list_text_info: A list of L{TextInfo}
        @param arff_file: A string. The path to the destination of the data represented
                          in the ARFF format.
        @param arff_header: A string containing an arff header, which will indicate the
                            features that will be taken into account. None if no header
                            specified.
        @param is_test_set: A boolean. True if it's a test set. False otherwise.
        @return: A dictionary mapping instance position in the ARFF data section to
                 text identifier.
        """
        farff = codecs.open(arff_file, "w")
        # BUG FIX: guarantee the output file is closed even when an exception
        # is raised while computing the header/data.
        try:
            dict_features = self._proccess(list_text_info, is_test_set)
            if arff_header is None:
                # No header supplied: derive the class set from the data itself.
                classes = set([text_info.get_category() for text_info in list_text_info])
                arff_header = self._arff_header(dict_features, classes)
            farff.write(arff_header)
            farff.flush()
            dict_feature_position = self._features_from_arff_header(arff_header)
            # Calculate arff_data
            arff_data, dict_position_instanceid = self._arff_data(dict_features,
                                                                  dict_feature_position,
                                                                  list_text_info)
            farff.write(arff_data)
            farff.flush()
        finally:
            farff.close()
        return dict_position_instanceid

    def arff_header_from_arff_file(self, arff_file):
        """
        @param arff_file: The path to an ARFF file
        @return: A string. The ARFF header of the arff_file (up to and including
                 the '@DATA' marker and its newline).
        """
        # BUG FIX: close the file handle once read (it was leaked before).
        farff = codecs.open(arff_file)
        try:
            arff_file_str = farff.read()
        finally:
            farff.close()
        ending_index = arff_file_str.find('@DATA')
        # +6 keeps '@DATA' (5 chars) plus the following newline.
        return arff_file_str[0:ending_index + 6]

    def _proccess(self, list_text_info, is_test_set):
        # NOTE(review): method name misspelling ('proccess') kept for
        # backward compatibility with existing subclasses/callers.
        """
        @param list_text_info: A list of L{TextInfo}
        @param is_test_set: A boolean. True if it's a test set. False otherwise.
        @return: The dictionary of counted features.
        """
        dict_features = self.count_features(list_text_info)
        return dict_features

    def _features_from_arff_header(self, arff_header):
        """
        @param arff_header: A string. An ARFF header
        @return: A dictionary of the features considered in the ARFF header with their
                 corresponding position in the file.
        """
        position = 0
        dict_features = {}
        lines_aux = arff_header.split('\n')
        # Skip the '@RELATION' line and the trailing '@DATA' line + empty line.
        lines_attributes = lines_aux[1:len(lines_aux) - 2]
        for l in lines_attributes:
            l_splitted = l.split()
            # Strip the surrounding quotes and un-escape embedded quotes.
            feature = l_splitted[1][1:len(l_splitted[1]) - 1].replace("\\'", "'")
            dict_features[feature] = position
            position += 1
        return dict_features

    def _arff_header(self, dict_features, classes):
        """
        @param dict_features: The dictionary of features provided by the count method
                              of L{Counter}.
        @param classes: The classes considered by the classifier.
        @return: A string. The ARFF header.
        """
        list_features = []
        arff_header = "@RELATION Polarity\n"

        for feature in dict_features:
            str_feature = self._long_id_feature_to_string(feature)
            _, ftc, _ = self._get_feature_configuration(str_feature)
            if str_feature not in list_features:
                list_features.append(str_feature)
                # Quotes inside a feature name must be escaped for WEKA.
                arff_header += "@ATTRIBUTE '" + str_feature.encode('utf-8').replace("'", "\\'") + "' " + ftc.get_weka_data_type() + "\n"

        arff_header += "@ATTRIBUTE 'class' {" + ','.join(classes) + "}\n"
        arff_header += "@DATA\n"
        return arff_header

    def get_weighting_value(self, str_ftc, value):
        """
        @param str_ftc: A string. The representation of an instance of
                        L{FeatureTypeConfiguration}
        @param value: An integer.
        @return: A weighted value according to the weighting factor employed by the
                 current adapter (1 for binary weighting, the raw count for total
                 weighting; None for an unknown factor).
        """
        if self._weighting_factor == self.BINARY_WEIGHTING_FACTOR:
            return 1
        if self._weighting_factor == self.TOTAL_WEIGHTING_FACTOR:
            return value

    def _arff_data(self, dict_features, dict_feature_position, list_text_info):
        """
        @param dict_features: The dictionary of features provided by the count method
                              of L{Counter}.
        @param dict_feature_position: A dictionary mapping feature name to its column
                                      position in the ARFF header.
        @param list_text_info: A list of L{TextInfo}
        @return: A tuple (arff_data, dict_instanceid_position) where arff_data is the
                 sparse-format '@DATA' section string and dict_instanceid_position maps
                 the instance position to its text identifier.
        """
        WEKA_RESERVED_WORDS = ['class']
        dict_features_file = defaultdict(defaultdict)
        dict_instanceid_position = {}
        position = 0
        # Accumulate per-file weighted counts, keyed by the printable feature name.
        for feature in dict_features:
            feature_id = feature[2]
            feature_type = feature[0]
            str_feature = self._long_id_feature_to_string(feature)
            if str_feature in WEKA_RESERVED_WORDS:
                continue
            file_path = self._counter.file_id(feature_id)
            value = dict_features[feature]
            try:
                dict_features_file[file_path][str_feature] = self.get_weighting_value(
                    feature_type,
                    dict_features_file[file_path][str_feature] + value)
            except KeyError:
                dict_features_file[file_path][str_feature] = self.get_weighting_value(
                    feature_type, value)

        arff_data = ""

        for text_info in list_text_info:
            open_symbol = "{"
            close_symbol = "}"

            textid = text_info.get_textid()
            # Only emit features that actually appear in the header, in
            # ascending column order (sparse ARFF requires sorted indices).
            keys = dict_features_file[textid].keys()
            keys = list(set(keys).intersection(set(dict_feature_position.keys())))
            keys.sort(key=lambda x: dict_feature_position[x])
            for feature_in_text in keys:
                arff_data += open_symbol + str(dict_feature_position[feature_in_text]) + " " + str(dict_features_file[textid][feature_in_text]) + ","
                open_symbol = ""
            arff_data += open_symbol + str(dict_feature_position['class']) + " " + text_info.get_category() + close_symbol + "\n"
            dict_instanceid_position[position] = textid
            position += 1

        return arff_data, dict_instanceid_position

    def _get_feature_configuration(self, str_name_feature):
        """
        Given a long name of a feature it returns the L{FeatureType}
        and the L{FeatureTypeConfiguration}.

        @param str_name_feature: A string. The long name of a feature.
        @return: A tuple (feature_type, feature_type_configuration, feature_name).
        """
        aux = str_name_feature.rsplit(FeatureTypeConfiguration.DELIMITER_CONFIGURATION)
        back_off_head = None
        back_off_dependent = None
        n_gram = None
        # The configuration elements sit between the feature type (first item)
        # and the feature name itself (last item).
        feature_type_configuration = aux[1:(len(aux) - 1)]
        kwargs = {}
        for configuration_element in feature_type_configuration:

            if configuration_element.startswith(FeatureTypeConfiguration.HEAD_BACK_OFF_DELIMITER + "="):
                back_off_head = configuration_element[len(FeatureTypeConfiguration.HEAD_BACK_OFF_DELIMITER + "="):]

            if configuration_element.startswith(FeatureTypeConfiguration.DEPENDENT_BACK_OFF_DELIMITER + "="):
                back_off_dependent = configuration_element[len(FeatureTypeConfiguration.DEPENDENT_BACK_OFF_DELIMITER + "="):]

            if configuration_element.startswith(FeatureTypeConfiguration.N_GRAM_DELIMITER + "="):
                n_gram = configuration_element[len(FeatureTypeConfiguration.N_GRAM_DELIMITER + "="):]
                kwargs['n_gram'] = n_gram

            if configuration_element.startswith(FeatureTypeConfiguration.N_GRAM_BACK_OFF_DELIMITER + "="):
                n_gram_back_off = configuration_element[len(FeatureTypeConfiguration.N_GRAM_BACK_OFF_DELIMITER + "="):]
                kwargs['n_gram_back_off'] = n_gram_back_off

            if configuration_element.startswith(FeatureTypeConfiguration.ADD_DEPENDENCY_TYPE_DELIMITER + "="):
                add_dependency_type = configuration_element[len(FeatureTypeConfiguration.ADD_DEPENDENCY_TYPE_DELIMITER + "="):]
                kwargs['add_dependency_type'] = True if add_dependency_type == 'True' else False

            if configuration_element.startswith(FeatureTypeConfiguration.SEMANTIC_PROPERTY_DELIMITER + "="):
                semantic_property = configuration_element[len(FeatureTypeConfiguration.SEMANTIC_PROPERTY_DELIMITER + "="):]
                kwargs['semantic_property'] = semantic_property

        # Keys must be native str for **-expansion under Python 2 when the
        # split produced unicode strings.
        feature_type_configuration = FeatureTypeConfiguration(back_off_head,
                                                              back_off_dependent,
                                                              **{str(k): v for k, v in
                                                                 kwargs.items()})

        return aux[0], feature_type_configuration, aux[len(aux) - 1]