
Source Code for Module miopia.preparator.LexicalProcessor

'''
@author: David Vilares Calvo
'''

import re
import codecs
from miopia.preparator.LexicalSentimentInfo import LexicalSentimentInfo, LexicalValenceShifter
from miopia.util.ConfigurationManager import ConfigurationManager
import xml.etree.ElementTree as ET

class LexicalProcessor(object):
    '''
    Splits raw text into sentences and tokens, normalizes fully capitalized and
    character-replicated words against a vocabulary, tags tokens, and reads/writes
    the corresponding L{LexicalSentimentInfo} as XML.
    '''

    def __init__(self, sentence_tokenizer, tokenizer, tagger,
                 vocabulary_set=[]):
        '''
        Constructor
        @param sentence_tokenizer: Use tokenizers/punkt/spanish.pickle from nltk_data
        @param tokenizer: An instance of L{nltk.tokenize.punkt.PunktWordTokenizer}
        @param tagger: Use spanish_brill.pickle (after deserializing) included in this package
        @param vocabulary_set: A Python set with the vocabulary
        '''
        self._sentence_tokenizer = sentence_tokenizer
        self._tokenizer = tokenizer
        self._tagger = tagger
        self._vocabulary_set = vocabulary_set
        '''
        if vocabulary_set is None:
            self._vocabulary_set = self._build_vocabulary_set(ConfigurationManager().getParameter("path_vocabulary_set"))
        else:
            self._vocabulary_set = vocabulary_set
        '''

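    # Illustrative note (not in the original source): one plausible way to build a
    # LexicalProcessor, assuming the NLTK resources named above are installed locally
    # (paths and the availability of PunktWordTokenizer depend on the NLTK version):
    #
    #   import pickle
    #   import nltk
    #   sentence_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    #   word_tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
    #   with open('spanish_brill.pickle', 'rb') as fh:
    #       tagger = pickle.load(fh)
    #   lp = LexicalProcessor(sentence_tokenizer, word_tokenizer, tagger,
    #                         vocabulary_set=set([u'bueno', u'genial']))
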
    def get_vocabulary_set(self):
        return self._vocabulary_set

    def _build_vocabulary_set(self, path_to_file):
        """
        @param path_to_file: A path to the file with the vocabulary of words.

        Example of the structure of a vocabulary file:
        Word1
        Word2
        ...
        WordN
        """
        with codecs.open(path_to_file, encoding='utf-8') as vocabulary_file:
            return set(vocabulary_file.read().split())

    def extract_sentences(self, text):
        """
        @param text: A String
        @return: A list of strings with the sentences of the text
        """
        return self._sentence_tokenizer.tokenize(text)

    def _replications(self, token):
        """
        @param token: A String
        @return: A list with the repeated chars in the token
        """
        # Numbers can have repeated digits, but those are not 'replications'
        try:
            float(token)
            return []
        except ValueError:
            matcher = re.compile(r'(.)\1*')
            replications = [match.group() for match in matcher.finditer(token)
                            if len(match.group()) >= 2]
            return replications

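    # Illustrative example (not in the original source), assuming `lp` is a
    # LexicalProcessor instance:
    #
    #   lp._replications(u'buenooo')   # -> [u'ooo']
    #   lp._replications(u'1999')      # -> []  (numbers are ignored)
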
    def _eliminate_replications(self, token, replications):
        """
        @param token: A String
        @param replications: A list with the replicated chars of a token
        @return: A valid word in the vocabulary if it exists, otherwise
        the original word.
        """
        combinations = []
        c = ""

        if token not in self._vocabulary_set and token.lower() not in self._vocabulary_set:
            # possible combinations, e.g. (xx,yy) (xx,y) (x,yy)
            for r in replications:
                # range(1,3): in Spanish at most two repeated chars are supported.
                for i in reversed(range(1, 3)):
                    c = token.replace(r, r[:i])
                    # lower() used to match words whose first letter is capitalised
                    if (c in self._vocabulary_set
                            or c.lower() in self._vocabulary_set):
                        combinations.append(c)
                    for r2 in replications:
                        if r2 is not r:
                            for j in reversed(range(1, 3)):
                                if ((c.replace(r2, r2[:j]) in self._vocabulary_set
                                     or c.replace(r2, r2[:j]).lower() in self._vocabulary_set)
                                        and c.replace(r2, r2[:j]) not in combinations):
                                    combinations.append(c.replace(r2, r2[:j]))
        if combinations == []:
            return token
        else:
            return sorted(combinations, key=len)[0]

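    # Illustrative example (not in the original source), assuming `lp` was built
    # with vocabulary_set=set([u'bueno']):
    #
    #   lp._eliminate_replications(u'buenooo', [u'ooo'])   # -> u'bueno'
    #   lp._eliminate_replications(u'xyzzz', [u'zzz'])     # -> u'xyzzz'  (no vocabulary match)
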
    def _is_upper_intesification(self, token):
        """
        @param token: A String
        @return: True if the token is a fully capitalized word that is not in
        the vocabulary, False otherwise
        """
        # Symbols like ',' or '.' are not considered; we only take words
        return (token.isupper() and token not in self._vocabulary_set)

    def _is_intensifier_replication(self, replications):
        """
        @param replications: A list with the replicated chars of a token
        @return: True if there are three or more replicated chars, False otherwise
        """
        longest_replication = sorted(replications, key=len, reverse=True)[0]
        return (len(longest_replication) >= 3 and
                longest_replication != '...')  # '...' is a special case

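    # Illustrative example (not in the original source), assuming `lp` was built
    # with vocabulary_set=set([u'bueno']):
    #
    #   lp._is_upper_intesification(u'GENIAL')      # -> True  (all caps, not in vocabulary)
    #   lp._is_intensifier_replication([u'ooo'])    # -> True  (3+ repeated chars)
    #   lp._is_intensifier_replication([u'...'])    # -> False (ellipsis is excluded)
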
    def extract_tokens(self, sentences):
        """
        @param sentences: A list of the sentences of a given text
        @return: A list of lists of tokens and a L{LexicalSentimentInfo}
        instance with the lexical sentiment information for the text.
        L{LexicalSentimentInfo} is None if no lexical sentiment info is found.
        """
        lsi = None
        sentences_tokenized = []
        processed_tokens = []
        id_sentence = 1

        for s in sentences:
            tokens = self._tokenizer.tokenize(s)
            id_token = 1
            for t in tokens:

                if self._is_upper_intesification(t):
                    if lsi is None: lsi = LexicalSentimentInfo()
                    t = t.lower()
                    lsi.add_lexical_valence_shifter(id_sentence,
                                                    id_token,
                                                    LexicalValenceShifter.CAPS)

                replications = self._replications(t)
                if replications != []:
                    # Two repeated chars are normalized but not counted as intensification
                    normalized_t = self._eliminate_replications(t, replications)
                    t = normalized_t
                    if self._is_intensifier_replication(replications):
                        if lsi is None: lsi = LexicalSentimentInfo()
                        lsi.add_lexical_valence_shifter(id_sentence,
                                                        id_token,
                                                        LexicalValenceShifter.REPLICATION)
                id_token += 1
                processed_tokens.append(t)
            sentences_tokenized.append(processed_tokens)
            processed_tokens = []
            id_sentence += 1
        return (sentences_tokenized, lsi)

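    # Illustrative example (not in the original source), assuming `lp` uses a simple
    # whitespace word tokenizer and was built with vocabulary_set=set([u'bueno', u'genial']):
    #
    #   tokens, lsi = lp.extract_tokens([u'Es GENIAL', u'Muy buenooo'])
    #   # tokens -> [[u'Es', u'genial'], [u'Muy', u'bueno']]
    #   # lsi    -> LexicalSentimentInfo recording CAPS at (sentence 1, token 2)
    #   #           and REPLICATION at (sentence 2, token 2)
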
    def extract_tags(self, tokenized_sentences):
        """
        @param tokenized_sentences: A list of lists of tokens
        @return: A list of tagged sentences. Each tagged sentence is a list of tuples (token, L{InfoTag})
        """
        return [self._tagger.tag(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

    def create_lexical_info_XML(self, dict_of_lsi, path_dest):
        """
        Writes an XML representation of the L{LexicalSentimentInfo}
        of each file to path_dest
        @param path_dest: A path to the destination XML file
        @param dict_of_lsi: A dictionary of L{LexicalSentimentInfo}
        """

        def create_element(parent, element_name, text):
            element = ET.SubElement(parent, element_name)
            element.text = text
            return element

        files = ET.Element('files')
        for lsi_file in dict_of_lsi.keys():
            if dict_of_lsi[lsi_file] is not None:
                file_element = ET.SubElement(files, 'file')
                create_element(file_element, 'fileid', lsi_file)
                sentences = ET.SubElement(file_element, 'sentences')
                for sentence_key in dict_of_lsi[lsi_file].get_dict():
                    sentence = ET.SubElement(sentences, 'sentence')
                    create_element(sentence, 'sentenceid', str(sentence_key))
                    words = ET.SubElement(sentence, 'words')
                    word_shifters = dict_of_lsi[lsi_file].get_dict()[sentence_key]
                    for word_id, phenomena in word_shifters.items():
                        word = ET.SubElement(words, 'word')
                        create_element(word, 'wordid', str(word_id))
                        create_element(word, 'phenomena', ','.join(phenomena))
        ET.ElementTree(files).write(path_dest, encoding='UTF-8')

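    # Illustrative sketch of the XML layout produced above (element names taken
    # from the code; the concrete file id and values are made-up examples):
    #
    #   <files>
    #     <file>
    #       <fileid>review_001.txt</fileid>
    #       <sentences>
    #         <sentence>
    #           <sentenceid>1</sentenceid>
    #           <words>
    #             <word><wordid>2</wordid><phenomena>CAPS</phenomena></word>
    #           </words>
    #         </sentence>
    #       </sentences>
    #     </file>
    #   </files>
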
    def read_lexical_info_XML(self, input_path):
        """
        @param input_path: A path to an XML file with the lexical sentiment
        info needed to build a dict of L{LexicalSentimentInfo}
        @return: A dictionary of L{LexicalSentimentInfo}
        """
        tree = ET.parse(input_path)
        root = tree.getroot()
        dict_of_lsi = {}
        files = root.findall('file')

        switch = {'CAPS': LexicalValenceShifter.CAPS,
                  'REPLICATION': LexicalValenceShifter.REPLICATION}

        for f in files:
            sentences = f.find('sentences').findall('sentence')
            lsi = LexicalSentimentInfo()
            for s in sentences:
                words = s.find('words').findall('word')
                for w in words:
                    phenomena = w.find('phenomena').text.split(',')
                    for phenomenon in phenomena:
                        lsi.add_lexical_valence_shifter(int(s.find('sentenceid').text),
                                                        int(w.find('wordid').text),
                                                        switch[phenomenon])
            dict_of_lsi[f.find('fileid').text] = lsi
        return dict_of_lsi
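

if __name__ == '__main__':
    # Illustrative sketch only (not part of the original module). It exercises the
    # pipeline end to end with minimal stand-in tokenizers and a dummy tagger so it
    # runs without the NLTK pickles mentioned in __init__; real usage would pass the
    # Punkt sentence tokenizer, a word tokenizer and the Brill tagger instead.
    class _SimpleSentenceTokenizer(object):
        def tokenize(self, text):
            # naive stand-in: split sentences on '.'
            return [s.strip() for s in text.split('.') if s.strip()]

    class _SimpleWordTokenizer(object):
        def tokenize(self, sentence):
            # naive stand-in: split tokens on whitespace
            return sentence.split()

    class _DummyTagger(object):
        def tag(self, tokens):
            # naive stand-in: no real tagging information
            return [(t, None) for t in tokens]

    lp = LexicalProcessor(_SimpleSentenceTokenizer(),
                          _SimpleWordTokenizer(),
                          _DummyTagger(),
                          vocabulary_set=set([u'bueno', u'genial']))
    sentences = lp.extract_sentences(u'Es GENIAL. Muy buenooo')
    tokenized, lsi = lp.extract_tokens(sentences)
    print(tokenized)                   # [[u'Es', u'genial'], [u'Muy', u'bueno']]
    print(lsi is not None)             # True: CAPS and REPLICATION were detected
    print(lp.extract_tags(tokenized))  # [[(u'Es', None), (u'genial', None)], ...]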