
Source Code for Module miopia.preprocessor.PreProcessor

# -*- coding: utf-8 -*-
'''
@author: David Vilares Calvo
'''
import codecs
import re

from miopia.preprocessor.PreProcessorI import PreProcessorI
from miopia.util.exceptions.LanguageNotSupportedException import LanguageNotSupportedException

class PreProcessor(PreProcessorI):
    '''
    Tools for preprocessing plain text
    '''

    RE_CURRENCY_SYMBOL = r'[\$€£]'
    RE_CURRENCY_CODE = r'[A-Z]{3}'

    # Regex fragments for the decimal mark and digit-grouping separator;
    # overridden per language in __init__
    decimal_mark = '\.'
    digit_grouping = ','

    def _prepare_regexps(self):
        r_decimals = r'' + self.decimal_mark + r'[0-9]+'
        r_ordinals = r'(?:st|nd|rd|th)'
        self.r_numbers = r'-?[0-9]+(?:' + self.digit_grouping + r'[0-9]{3})*(?:' + r_decimals + r')?' + r_ordinals + '?'
        self.r_numbers_std = r'-?[0-9]+(?:\.[0-9]+)?'
        self.r_time = r'[0-9]+[\.:][0-9]{2}h'

    def _convert_numbers(self, text):
        """
        Re-joins split times and separates currency symbols and codes from
        the adjacent number. Digit-grouping removal is currently disabled
        (see the TODO below).
        """
        def __format_matched_numbers(match):
            return " " + match.group(1).replace(self.digit_grouping, "").replace(self.decimal_mark, ".") + " "

        # TODO: this line is splitting numbers from words
        #text = re.sub('('+self.r_numbers+')', __format_matched_numbers, text)

        # Re-join hours that were split around the time separator
        text = re.sub(r'([0-9]+) [\.:] ([0-9]{2}) h(?:\s|$)', r'\1.\2 ', text)

        # Re-join ordinals
        #text = re.sub(r'([2-9]+) st\s')

        # Separate currency symbols/codes from the adjacent number
        text = re.sub('('+self.RE_CURRENCY_SYMBOL+')([0-9]+)', r'\1 \2', text)
        text = re.sub('('+self.RE_CURRENCY_CODE+')([0-9]+)', r'\1 \2', text)
        text = re.sub('([0-9]+)('+self.RE_CURRENCY_SYMBOL+')', r'\1 \2', text)
        text = re.sub('([0-9]+)('+self.RE_CURRENCY_CODE+')', r'\1 \2', text)

        return text

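    # Illustrative behaviour (hypothetical input), e.g. with lang='en':
    #   _convert_numbers("It costs $1,234.56")  ->  "It costs $ 1,234.56"
    # Only the currency symbol is spaced; the digit grouping is left
    # untouched while the substitution above stays commented out.
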
    def _is_number(self, text):
        return re.match('^' + self.r_numbers_std + '$', text) is not None

    def __init__(self, composite_words={}, abbreviations={}, lang='es'):
        '''
        Constructor
        @param composite_words: A composite words dictionary in the format {OriginalWord:JoinedWord}
        @param abbreviations: An abbreviations dictionary in the format {abbreviation:OriginalWord}
        @param lang: The language code: 'es' (Spanish, default) or 'en' (English)
        '''
        self.lang = lang

        if lang == 'es':
            self.decimal_mark = ','
            self.digit_grouping = '\.'
        elif lang == 'en':
            self.decimal_mark = '\.'
            self.digit_grouping = ','
        else:
            raise LanguageNotSupportedException(lang)

        self._prepare_regexps()

        self._composite_words = composite_words
        self._abbreviations = abbreviations
        self._composite_words_patterns = self._get_composite_words_patterns(self._composite_words)
        self._abbreviations_patterns = self._get_abbreviations_patterns(self._abbreviations)
        self._special_abbreviations_patterns = self._get_special_abbreviations_patterns(self._abbreviations)

    def _get_composite_words_patterns(self, dict_composite_words):
        dict_composite_words_patterns = {}
        keys = dict_composite_words.keys()
        for key in keys:
            pattern = re.compile('[ .,;:¡¿!?\[]+'.decode('utf-8')+key+'[ .,;:¡¿!?\]]+'.decode('utf-8')+'|'
                                 +'[ .,;:¡¿!?\]]+'.decode('utf-8')+key+'$'+'|'
                                 +'^'+key+'[ .,;:¡¿!?\]]+'.decode('utf-8')+'|'
                                 +'\\b'+key+'\\b', re.IGNORECASE)
            dict_composite_words_patterns[key] = pattern
        return dict_composite_words_patterns

    def _get_abbreviations_patterns(self, dict_abbreviations):
        #TODO: Bug if the line is exactly an abbreviation
        dict_abbreviations_patterns = {}
        abbreviations = dict_abbreviations.keys()
        for abbr in abbreviations:
            try:
                aux_abbr = abbr.replace('(', '\(').replace(')', '\)').replace('[', '\[').replace(']', '\]')
                pattern = re.compile('[ .,;:¡¿!?\[]+'.decode('utf-8')+aux_abbr+'[ .,;:¡¿!?\]]+'.decode('utf-8')+'|'
                                     +'[ .,;:¡¿!?\]]+'.decode('utf-8')+aux_abbr+'$'+'|'
                                     +'^'+aux_abbr+'[ .,;:¡¿!?\]]+'.decode('utf-8'), re.IGNORECASE)
                dict_abbreviations_patterns[abbr] = pattern
            except:
                # Skip abbreviations whose pattern cannot be built here
                # (e.g. non-ASCII keys); _format_abbreviations falls back to
                # the special patterns for them
                pass
        return dict_abbreviations_patterns

    def _get_special_abbreviations_patterns(self, dict_abbreviations):
        dict_abbreviations_patterns = {}
        abbreviations = dict_abbreviations.keys()
        for abbr in abbreviations:
            if not isinstance(abbr, unicode):
                abbr = abbr.decode('utf-8')
            pattern = re.compile('[ .,;:¡¿!?\[]+'.decode('utf-8')+re.escape(abbr)+'[ .,;:¡¿!?\]]+'.decode('utf-8')+'|'
                                 +'[ .,;:¡¿!?\]]+'.decode('utf-8')+re.escape(abbr)+'$'+'|'
                                 +'^'+re.escape(abbr)+'[ .,;:¡¿!?\]]+'.decode('utf-8'), re.IGNORECASE)
            dict_abbreviations_patterns[abbr] = pattern
        return dict_abbreviations_patterns

    def _format_punkt(self, token):
        """
        @param token: A token
        @return: The token with punctuation separated by spaces if it is not
        a number; otherwise the token unchanged
        """
        lpunkt = [".", ",", ";", ":", "¡", "¿"]
        if not self._is_number(token):
            if type(token) == str:
                # Decode to unicode, ignoring bytes that are not valid UTF-8
                token = unicode(token, "utf-8", errors="ignore")
            else:
                # Assume the value object has a proper __unicode__() method
                token = unicode(token)
            # Special quotes normalization; unicode literals are needed
            # because the curly quotes are non-ASCII characters
            token = token.replace(u"“", "\"").replace(u"”", "\"")
            # Ellipses are already handled well by the parser
            if "..." in token:
                return token
            else:
                for p in lpunkt:
                    if not token.endswith(p.decode("utf-8")):
                        token = token.replace(p.decode("utf-8"), p.decode("utf-8") + " ")
                if '.' in token and not '.' == token:
                    token = token.replace(".", " .")
                return token
        else:
            return token

    def _format_composite_words(self, line):
        """
        @param line: A line of a sentence
        @return: A line where composite words are joined as one token
        """
        keys = self._composite_words.keys()
        keys_in_line = [key for key in keys if key in line]
        for key in keys_in_line:
            composed_expressions = self._composite_words_patterns[key].findall(line)
            for c in composed_expressions:
                line = line.replace(c, c.replace(key, self._composite_words.get(key)))
        return line

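    # Illustrative behaviour (hypothetical dictionary entry): with
    # composite_words = {"a pesar de": "a_pesar_de"},
    #   _format_composite_words("Lo hizo a pesar de todo")
    # returns "Lo hizo a_pesar_de todo".
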
    def _format_upper_abbreviations(self, line, a, abbr):
        # 'a' contains the abbreviation, possibly with different capitalisation
        if abbr not in a:
            return line.replace(a, a.lower()).replace(a.lower(), a.lower().replace(abbr, self._abbreviations.get(abbr)))
        else:
            return line.replace(a, a.replace(abbr, self._abbreviations.get(abbr)))

    def _format_abbreviations(self, line):
        abbreviations = self._abbreviations.keys()
        abbreviations_in_line = [abbr for abbr in abbreviations if abbr in line]

        for abbr in abbreviations_in_line:
            try:
                abbreviations_found = set(self._abbreviations_patterns[abbr].findall(line))
            except KeyError:
                # Abbreviations skipped in _get_abbreviations_patterns are
                # looked up in the special (unicode) patterns instead
                abbreviations_found = set(self._special_abbreviations_patterns[abbr].findall(line))
            for abbreviation_found in abbreviations_found:
                line = self._format_upper_abbreviations(line, abbreviation_found, abbr)
        return line

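    # Illustrative behaviour (hypothetical dictionary entry): with
    # abbreviations = {"Dr.": "Doctor"},
    #   _format_abbreviations(u"Vino el Dr. García")
    # returns u"Vino el Doctor García".
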
    def preprocess(self, text):
        """
        @param text: A string
        @return: The preprocessed string
        """
        tokens = []

        aux = self._convert_numbers(text).split()

        # For each word
        for a in aux:
            tokens.append(self._format_punkt(a))
        return self._format_composite_words(self._format_abbreviations(' '.join(tokens)))
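
A minimal usage sketch, assuming hypothetical composite-word and abbreviation dictionaries (Python 2, like the module itself):

    from miopia.preprocessor.PreProcessor import PreProcessor

    composite_words = {"a pesar de": "a_pesar_de"}   # hypothetical entry
    abbreviations = {"Dr.": "Doctor"}                # hypothetical entry
    preprocessor = PreProcessor(composite_words, abbreviations, lang='es')
    print preprocessor.preprocess("Lo hizo a pesar de todo")
    # expected output: u"Lo hizo a_pesar_de todo"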