Home | Trees | Indices | Help |
---|
|
#-*- coding: utf-8 -*-
'''
@author: David Vilares Calvo
'''
import codecs
import re

from miopia.preprocessor.PreProcessorI import PreProcessorI
from miopia.util.exceptions.LanguageNotSupportedException import LanguageNotSupportedException


# NOTE(review): this module was recovered from a flattened documentation
# listing in which the ``class`` statement and every ``def`` line were missing.
# The class name, base class and method signatures below were reconstructed
# from the imports, the call sites and the docstrings -- confirm them against
# the real module before merging.
class PreProcessor(PreProcessorI):
    '''
    Tools for preprocessing a plain text
    '''

    RE_CURRENCY_SYMBOL = r'[\$€£]'
    RE_CURRENCY_CODE = r'[A-Z]{3}'

    # Regex fragments; overwritten per language in the constructor.
    decimal_mark = r'\.'
    digit_grouping = ','

    # Runs of separators accepted before / after a composite word or an
    # abbreviation. They differ only in the bracket they admit.
    _LEADING_DELIMS = u'[ .,;:¡¿!?\\[]+'
    _TRAILING_DELIMS = u'[ .,;:¡¿!?\\]]+'

    def _prepare_regexps(self):
        """
        Builds the number and time regular expressions from the current
        decimal_mark and digit_grouping and stores them on the instance
        (r_numbers, r_numbers_std, r_time).
        """
        r_decimals = r''+self.decimal_mark+r'[0-9]+'
        r_ordinals = r'(?:st|nd|rd|th)'
        self.r_numbers = (r'-?[0-9]+(?:'+self.digit_grouping+r'[0-9]{3})*'
                          + r'(?:'+r_decimals+')?'+r_ordinals+'?')
        self.r_numbers_std = r'-?[0-9]+(?:\.[0-9]+)?'
        self.r_time = r'[0-9]+[\.:][0-9]{2}h'

    def _convert_numbers(self, text):
        """
        Re-joins hour expressions that were split into separate tokens and
        inserts a space between currency symbols/codes and adjacent digits.
        Removal of digit grouping is currently disabled (see the TODO below).
        @param text: A string
        @return: The string with hours re-joined and currencies spaced
        """
        def _format_matched_numbers(match):
            # Only used by the disabled substitution below.
            return " "+match.group(1).replace(self.digit_grouping,"").replace(self.decimal_mark,".")+" "

        #TODO this line is splitting numbers from words
        #text = re.sub('('+self.r_numbers+')', _format_matched_numbers, text)

        #re-join hour
        text = re.sub(r'([0-9]+) [\.:] ([0-9]{2}) h(?:\s|$)', r'\1.\2 ', text)

        #re-join ordinals
        #text = re.sub(r'([2-9]+) st\s')

        # The order matters: currency before the digits first, then after.
        spacing_patterns = (
            '('+self.RE_CURRENCY_SYMBOL+')([0-9]+)',
            '('+self.RE_CURRENCY_CODE+')([0-9]+)',
            '([0-9]+)('+self.RE_CURRENCY_SYMBOL+')',
            '([0-9]+)('+self.RE_CURRENCY_CODE+')',
        )
        for pattern in spacing_patterns:
            text = re.sub(pattern, r'\1 \2', text)

        return text

    # NOTE(review): parameter order and the default for ``lang`` are
    # reconstructed from the docstring and the body -- confirm.
    def __init__(self, composite_words, abbreviations, lang='es'):
        '''
        Constructor
        @param composite_words: A composite words dictionary in the format {OriginalWord:JoinedWord}
        @param abbreviations: An abbreviations dictionary in the format {abbreviation:OriginalWord}
        @param lang: Language code, 'es' or 'en'; selects the decimal mark
        and the digit grouping symbol
        @raise LanguageNotSupportedException: If lang is neither 'es' nor 'en'
        '''
        self.lang = lang

        if lang == 'es':
            self.decimal_mark = ','
            self.digit_grouping = r'\.'
        elif lang == 'en':
            self.decimal_mark = r'\.'
            self.digit_grouping = ','
        else:
            raise LanguageNotSupportedException(lang)

        self._prepare_regexps()

        self._composite_words = composite_words
        self._abbreviations = abbreviations
        self._composite_words_patterns = self._get_composite_words_patterns(self._composite_words)
        self._abbreviations_patterns = self._get_abbreviations_patterns(self._abbreviations)
        self._special_abbreviations_patterns = self._get_special_abbreviations_patterns(self._abbreviations)

    def _compile_delimited(self, body, whole_word=False):
        """
        Compiles the case-insensitive pattern shared by composite words and
        abbreviations: body surrounded by separators, body at the end of the
        line, or body at the start of the line.
        @param body: A regular expression fragment
        @param whole_word: If True, body between word boundaries also matches
        @return: A compiled regular expression
        """
        alternatives = [
            self._LEADING_DELIMS+body+self._TRAILING_DELIMS,
            # The end-of-line alternative uses the trailing separator class
            # as its prefix, exactly as the original expression did.
            self._TRAILING_DELIMS+body+'$',
            '^'+body+self._TRAILING_DELIMS,
        ]
        if whole_word:
            alternatives.append('\\b'+body+'\\b')
        return re.compile('|'.join(alternatives), re.IGNORECASE)

    def _get_composite_words_patterns(self, composite_words):
        """
        @param composite_words: Not read; the patterns are built from
        self._composite_words
        @return: A dictionary {composite word: compiled pattern}
        """
        return dict((key, self._compile_delimited(key, whole_word=True))
                    for key in self._composite_words)

    def _get_abbreviations_patterns(self, abbreviations):
        """
        @param abbreviations: Not read; the patterns are built from
        self._abbreviations
        @return: A dictionary {abbreviation: compiled pattern}. Abbreviations
        whose pattern cannot be built are left out
        """
        #TODO: Bug if the line is exactly an abbreviation
        dict_abbreviations_patterns = {}
        for abbr in self._abbreviations:
            aux_abbr = (abbr.replace('(', '\\(').replace(')', '\\)')
                            .replace('[', '\\[').replace(']', '\\]'))
            try:
                pattern = self._compile_delimited(aux_abbr)
            except (UnicodeError, re.error):
                # Entries that cannot be mixed with the unicode separators or
                # that are not valid regular expressions are skipped here;
                # _format_abbreviations falls back to the special patterns.
                continue
            dict_abbreviations_patterns[abbr] = pattern
        return dict_abbreviations_patterns

    def _get_special_abbreviations_patterns(self, abbreviations):
        """
        Same as _get_abbreviations_patterns, but every abbreviation is decoded
        to unicode and fully escaped with re.escape.
        @param abbreviations: Not read; the patterns are built from
        self._abbreviations
        @return: A dictionary {unicode abbreviation: compiled pattern}
        """
        dict_abbreviations_patterns = {}
        for abbr in self._abbreviations:
            if not isinstance(abbr, type(u'')):
                abbr = abbr.decode('utf-8')
            dict_abbreviations_patterns[abbr] = self._compile_delimited(re.escape(abbr))
        return dict_abbreviations_patterns

    def _format_punkt(self, token):
        """
        @param token: A token
        @return: A modified token with separated punkt, if is not a number, otherwise returns the token
        """
        # NOTE(review): _is_number is not defined in the visible part of this
        # module; presumably it is provided elsewhere -- confirm.
        if self._is_number(token):
            return token

        if isinstance(token, str):
            # Ignore errors even if the string is not proper UTF-8 or has
            # broken marker bytes.
            token = unicode(token, "utf-8", errors="ignore")
        else:
            # Assume the value object has proper __unicode__() method
            token = unicode(token)

        #Special quotes normalization. Unicode literals are required because
        #the quotes are non ASCII chars.
        token = token.replace(u"“", u"\"").replace(u"”", u"\"")

        #It is processed already well by the parser
        if "..." in token:
            return token

        for p in (u".", u",", u";", u":", u"¡", u"¿"):
            if not token.endswith(p):
                token = token.replace(p, p+u" ")
        if '.' in token and token != '.':
            token = token.replace(".", " .")
        return token

    def _format_composite_words(self, line):
        """
        @param line: A line of a sentence
        @return: A line where composite words are joined as one token
        """
        # Candidates are selected on the untouched line, before any
        # replacement is made.
        keys_in_line = [key for key in self._composite_words if key in line]
        for key in keys_in_line:
            joined = self._composite_words.get(key)
            for c in self._composite_words_patterns[key].findall(line):
                line = line.replace(c, c.replace(key, joined))
        return line

    def _format_upper_abbreviations(self, line, a, abbr):
        """
        @param line: A line of a sentence
        @param a: The text matched by the pattern of the abbreviation
        @param abbr: The abbreviation, as written in the dictionary
        @return: The line where the abbreviation inside a is expanded
        """
        expansion = self._abbreviations.get(abbr)
        if abbr in a:
            return line.replace(a, a.replace(abbr, expansion))
        #Is the abbreviation but with different capitalisation
        lowered = a.lower()
        return line.replace(a, lowered).replace(lowered, lowered.replace(abbr, expansion))

    def _format_abbreviations(self, line):
        """
        @param line: A line of a sentence
        @return: The line where the known abbreviations are expanded
        """
        abbreviations_in_line = [abbr for abbr in self._abbreviations if abbr in line]

        for abbr in abbreviations_in_line:
            try:
                pattern = self._abbreviations_patterns[abbr]
            except KeyError:
                # No regular pattern could be built for this abbreviation.
                pattern = self._special_abbreviations_patterns[abbr]
            for abbreviation_found in set(pattern.findall(line)):
                line = self._format_upper_abbreviations(line, abbreviation_found,
                                                        abbr)
        return line

    # NOTE(review): the method name was not visible in the listing;
    # ``preprocess`` is assumed -- confirm against PreProcessorI.
    def preprocess(self, text):
        """
        @param text: A string
        @return: A string preprocessed
        """
        tokens = [self._format_punkt(a)
                  for a in self._convert_numbers(text).split()]
        return self._format_composite_words(self._format_abbreviations(' '.join(tokens)))
Home | Trees | Indices | Help |
---|
Generated by Epydoc 3.0.1 on Wed Oct 15 10:03:40 2014 | http://epydoc.sourceforge.net |