
Source Code for Module miopia.parser.Parser

# -*- coding: utf-8 -*-
'''
@author: David Vilares Calvo
'''

import os
import codecs
import tempfile
from miopia.util.ConfigurationManager import ConfigurationManager
from miopia.parser.SentimentDependencyGraph import SentimentDependencyGraph
from miopia.preparator.TextPreparator import TextPreparator
from miopia.parser.TokenDependencyInfo import TokenDependencyInfo

class Parser(object):
    """
    Tools for interacting with MaltParser 1.7* and obtaining L{SentimentDependencyGraph} instances.
    """

    __lang = 'es'

    def __init__(self, lang='es'):
        """
        Constructor
        @param lang: Language code used to select the parsing configuration (default 'es').
        """
        self._preparator = TextPreparator()
        self.__lang = lang

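    # --- Illustrative usage sketch (not part of the original module) ---
    # Assuming `tagged_sentences` is a [[(token, INfoTag)]] list produced by a
    # lexical/POS-tagging step (e.g. a LexicalProcessor, as the docstrings below
    # suggest), a typical round trip could look like:
    #
    #   parser = Parser(lang='es')
    #   graphs = parser.parse(tagged_sentences)
    #   # graphs is a [SentimentDependencyGraph], one graph per sentence
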
    def parse_from_conll(self, file_path):
        """
        @param file_path: A path to a CoNLL 2006 file
        @return: A list of L{SentimentDependencyGraph} representing the parsed file.
        """
        #TODO: Better to simplify in parse_to_file, not here
        sentences = self._simplify(file_path)
        return [SentimentDependencyGraph(s) for s in sentences]

    def parse_dir_to_file(self, dir_path, list_of_tagged_sentences,
                          input_path="/tmp/parse_dir_to_file_unparsed.conll",
                          output_path="/tmp/parse_dir_to_file_parsed.conll"):
        """
        Parses a whole directory of plain texts into a single file.
        @param dir_path: The directory where the files holding the sentences stored
        in list_of_tagged_sentences will be written in CoNLL-2006 format.
        @param list_of_tagged_sentences: A list of (text_id, [[(token, L{INfoTag})]]) tuples.
        @param input_path: The path to the file where all plain sentences are written
        in CoNLL 2006 format before parsing them.
        @param output_path: The path to the file where the parsed output is written.
        """
        list_id_and_number_of_sentences = []  # (text_id, number of sentences of the file)
        dir_tagged_sentences = []
        for text_id, tagged_sentences in list_of_tagged_sentences:
            if tagged_sentences == [[]]:
                list_id_and_number_of_sentences.append((text_id, 0))
            else:
                list_id_and_number_of_sentences.append((text_id, len(tagged_sentences)))
                dir_tagged_sentences.extend(tagged_sentences)
        self.parse_to_file(output_path, dir_tagged_sentences, input_path)
        sentences = open(output_path).read().split('\n\n')
        i = 0
        for text_id, number_sentences in list_id_and_number_of_sentences:
            if number_sentences == 0:
                open(dir_path + os.sep + text_id, "w").write('')
            else:
                open(dir_path + os.sep + text_id, "w").write(
                    '\n\n'.join(sentences[i:i + number_sentences]) + '\n\n')
            i += number_sentences

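    # --- Illustrative input sketch (not part of the original module) ---
    # `list_of_tagged_sentences` pairs a text identifier with its tagged
    # sentences. With hypothetical INfoTag instances `tag1`, `tag2`, ... the
    # structure looks roughly like:
    #
    #   list_of_tagged_sentences = [
    #       ("review_001.txt", [[("Me", tag1), ("gusta", tag2)]]),
    #       ("review_002.txt", [[]]),   # a text with no sentences
    #   ]
    #   parser.parse_dir_to_file("/tmp/parsed_reviews", list_of_tagged_sentences)
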
    def parse_to_file(self, output_path, tagged_sentences, aux_path=None):
        """
        @param output_path: The destination file.
        @param tagged_sentences: [tagged_sentence] where tagged_sentence is a [(token, L{INfoTag})].
        Use L{LexicalProcessor} to obtain them.
        @param aux_path: The path to an auxiliary file used to parse the sentences.
        """
        if aux_path is None:
            aux_path = tempfile.NamedTemporaryFile(delete=False).name
        self._preparator.prepare(aux_path, tagged_sentences)
        self.parse_tagged_file(aux_path, output_path)
        os.unlink(aux_path)

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        """
        Runs MaltParser on a tagged CoNLL file and writes the parsed output.
        @param tagged_file_name: Path to the tagged input file in CoNLL 2006 format.
        @param output_file_name: Path where the parsed output will be written.
        """
        original_dir = os.getcwd()
        c = ConfigurationManager(lang=self.__lang)
        os.chdir(c.getParameter("path_maltparser_model"))
        os.system("java -jar " + c.getParameter("path_maltparser") + " -c " + c.getParameter("maltparser_model") +
                  " -i " + tagged_file_name + " -o " + output_file_name + " -m parse")
        os.chdir(original_dir)

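    # --- Illustrative command sketch (not part of the original module) ---
    # With hypothetical configuration values, the os.system call above shells
    # out to something like:
    #
    #   java -jar /opt/maltparser/maltparser-1.7.2.jar -c es_model \
    #        -i tagged.conll -o parsed.conll -m parse
    #
    # where the jar path and model name come from ConfigurationManager.
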
    def parse_dir(self, list_of_tagged_sentences):
        """
        Parses a list of tagged texts and groups the resulting graphs by text.
        @param list_of_tagged_sentences: A list of (text_id, [[(token, L{INfoTag})]]) tuples.
        @return: A list of (text_id, [L{SentimentDependencyGraph}]) tuples.
        """
        list_id_and_number_of_sentences = []  # (text_id, number of sentences of the file)
        list_id_and_dependency_graphs = []
        dir_tagged_sentences = []
        for text_id, tagged_sentences in list_of_tagged_sentences:
            list_id_and_number_of_sentences.append((text_id, len(tagged_sentences)))
            dir_tagged_sentences.extend(tagged_sentences)

        graphs = self.parse(dir_tagged_sentences)
        i = 0
        for text_id, number_sentences in list_id_and_number_of_sentences:
            list_id_and_dependency_graphs.append((text_id, graphs[i:i + number_sentences]))
            i += number_sentences
        return list_id_and_dependency_graphs

    def parse(self, tagged_sentences, temp_input=None, temp_output=None):
        """
        @param tagged_sentences: [tagged_sentence] where tagged_sentence is a [(token, L{INfoTag})].
        Use L{LexicalProcessor} to obtain them.
        @param temp_input: Temporary file to save the unparsed text.
        @param temp_output: Temporary file to save the parsed text.
        @return: A [L{SentimentDependencyGraph}]
        """
        if temp_input is None:
            temp_input = tempfile.NamedTemporaryFile(delete=False).name

        if temp_output is None:
            temp_output = tempfile.NamedTemporaryFile(delete=False).name

        self._preparator.prepare(temp_input, tagged_sentences)
        self.parse_tagged_file(temp_input, temp_output)
        sentences = self._simplify(temp_output)

        os.unlink(temp_input)
        os.unlink(temp_output)
        return [SentimentDependencyGraph(s) for s in sentences]

    def _simplify(self, parsed_file):
        """
        Simplifies a CoNLL 2006 file. The output is used to build instances of L{SentimentDependencyGraph}.
        @param parsed_file: A path to a CoNLL 2006 file
        @return: A list of strings, one per sentence, where each token line has
        the form FORM\tPOSTAG\tHEAD\tDEPREL.
        """
        co = codecs.open(parsed_file, encoding="utf-8")
        lines = co.readlines()
        sentence = {}
        sentences = []

        for l in lines:
            if len(l) > 1:
                columns = l.split('\t')
                t = TokenDependencyInfo(columns[1], columns[4], int(columns[6]), columns[7])
                sentence[int(columns[0])] = t
            else:
                sentences.append(sentence)
                sentence = {}
        co.close()
        if sentence != {}:
            sentences.append(sentence)
        return self._format(self._reorganize(sentences))

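    # --- Illustrative CoNLL 2006 line (not part of the original module) ---
    # A parsed line has ten tab-separated columns; _simplify keeps columns
    # 2 (FORM), 5 (POSTAG), 7 (HEAD) and 8 (DEPREL), counting from 1:
    #
    #   ID  FORM   LEMMA  CPOSTAG  POSTAG          FEATS  HEAD  DEPREL  ...
    #   2   gusta  _      v        v:postype_main  _      0     sentence
    #
    # (the column values shown here are made up for illustration)
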
    def _right_brothers(self, sentence, identifier):
        """
        @param sentence: An adversative sentence
        @param identifier: ID of the adversative clause
        @return: A list with the IDs of the adversative token and of the siblings to
        its right (tokens sharing the same head)
        """
        brothers = []
        father = sentence[identifier].get_head()

        for key in sentence.keys():
            if sentence[key].get_head() == father and key >= identifier:
                brothers.append(key)
        return brothers

    def _reorganize(self, sentences):
        """
        Reorganizes the parsed CoNLL 2006 output to simplify the subordinating sentences.
        @param sentences: A list of dictionaries. Each dictionary is a sentence in CoNLL 2006
        representation. ID is the key and a L{TokenDependencyInfo} is the value.
        @return: The list of reorganized sentences.
        """
        for sentence in sentences:
            for key in sentence.keys():

                if self._is_symbolic_url(sentence[key]):
                    sentence = self._reorganize_symbolic_url(sentence, key)
                if self._is_emoticon(sentence[key]):
                    sentence = self._reorganize_emoticon(sentence, key)
                if self._is_reorganizable_adversative(sentence[key]):
                    sentence = self._reorganize_adversative(sentence, key)
        return sentences

    def _is_symbolic_url(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance
        @return: True if the token form equals 'SymbolicURL', False otherwise
        """
        return token.get_form() == 'SymbolicURL'

    def _reorganize_symbolic_url(self, sentence, key):
        """
        @precondition: The L{TokenDependencyInfo} sentence[key] must be a symbolic URL
        @param sentence: A dictionary of L{TokenDependencyInfo}. Represents a sentence
        in CoNLL-2006. The ID column is the key.
        @param key: ID of the symbolic URL token
        @return: The dictionary with the information of the symbolic URL token modified
        """
        sentence[key].set_deprel("art_rel_symbolicurl")
        sentence[key].set_finetag("symbolicurl:")
        return sentence

    def _is_emoticon(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance
        @return: True if the token form is in the set ['Emoticon-Negative', 'Emoticon-Positive',
        'Extremely-Emoticon-Positive', 'Extremely-Emoticon-Negative', 'Neutral'], False otherwise
        """
        set_emoticons = set(['Emoticon-Negative', 'Emoticon-Positive',
                             'Extremely-Emoticon-Positive',
                             'Extremely-Emoticon-Negative',
                             'Neutral'])

        return token.get_form() in set_emoticons

    def _reorganize_emoticon(self, sentence, key):
        """
        @precondition: The L{TokenDependencyInfo} sentence[key] must be an emoticon
        @param sentence: A dictionary of L{TokenDependencyInfo}. Represents a sentence
        in CoNLL-2006. The ID column is the key.
        @param key: ID of the emoticon token
        @return: The dictionary with the information of the emoticon token modified
        """
        sentence[key].set_deprel("art_rel_emoticon")
        sentence[key].set_finetag("emoticon:")
        return sentence

    def _is_reorganizable_adversative(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance
        @return: True if the token can be reorganized ('pero', 'sino', 'mientras',
        'mientras_que', 'sino_que'), False otherwise
        """
        ladversatives = ['pero', 'sino', 'mientras', 'mientras_que', 'sino_que']

        return (token.get_finetag() == "c:postype_coordinating"
                and (token.get_form() in ladversatives)
                and token.get_head() != 0 and token.get_deprel() == "coord")

    def _reorganize_adversative(self, sentence, key):
        """
        @precondition: The adversative clause must be reorganizable
        @param sentence: A dictionary of a sentence in CoNLL 2006. ID is the key and
        a L{TokenDependencyInfo} is the value.
        @param key: ID of an adversative clause
        @return: The dictionary with an artificial adversative node added and the
        affected heads reattached to it
        """
        head = sentence[key].get_head()
        artificial_id = len(sentence) + 1
        form = sentence[key].get_form()
        artificial_node = TokenDependencyInfo("[]", "art_adversative:" + self._type_of_adversative(form) + "@" + str(key),
                                              sentence[head].get_head(), "art_rel_adversative")
        sentence[artificial_id] = artificial_node
        sentence[head].set_head(artificial_id)
        right_brothers = self._right_brothers(sentence, key)
        for brother in right_brothers:
            sentence[brother].set_head(artificial_id)
        return sentence

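    # --- Illustrative reorganization (not part of the original module) ---
    # For a made-up sentence "me gusta pero es caro", where "pero" (ID 3) is a
    # coordinating adversative whose head is "gusta" (ID 2), the method adds an
    # artificial node with form "[]" and fine tag "art_adversative:restrict@3",
    # attaches it to the former head of "gusta", and reattaches "gusta" itself,
    # "pero" and the siblings of "pero" to its right to that artificial node.
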
    def _type_of_adversative(self, form):
        """
        @precondition: form must be in {'pero', 'sino', 'mientras', 'mientras_que', 'sino_que'}
        @param form: The form of an adversative conjunction
        @return: 'restrict' if the clause is restrictive, 'exclude' otherwise
        """
        if form in ['pero', 'mientras', 'mientras_que']:
            return 'restrict'
        else:
            return 'exclude'

    def _format(self, sentences):
        """
        Prepares the text needed to build a DependencyGraph instance.
        @param sentences: A list of dictionaries of L{TokenDependencyInfo}, keyed by token ID
        @return: A list of strings, one per sentence, with one FORM\tPOSTAG\tHEAD\tDEPREL
        line per token
        """
        data_string = ""
        formatted_sentences = []

        for sentence in sentences:
            for key in sentence.keys():
                token = sentence[key]
                data_string = data_string + token.get_form() + '\t' + token.get_finetag() \
                    + '\t' + str(token.get_head()) + '\t' + token.get_deprel() + '\n'
            formatted_sentences.append(data_string)
            data_string = ""
        return formatted_sentences

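    # --- Illustrative _format output (not part of the original module) ---
    # Each formatted sentence is a newline-separated block such as (tag values
    # made up for illustration):
    #
    #   Me\tp:postype_personal\t2\tsuj
    #   gusta\tv:postype_main\t0\tsentence
    #   pero\tc:postype_coordinating\t2\tcoord
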
class MaltParser(Parser):
    """
    MaltParser wrapper
    """

    def __init__(self, parser_bin, model_dir, model_name):
        """
        Constructor
        @param parser_bin: Path to the MaltParser jar file
        @param model_dir: Directory containing the MaltParser model
        @param model_name: Name of the MaltParser model
        """
        self._preparator = TextPreparator()
        self._parser_bin = parser_bin
        self._model_dir = model_dir
        self._model_name = model_name

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        """
        Runs the MaltParser jar on a tagged CoNLL file and writes the parsed output.
        """
        original_dir = os.getcwd()
        os.chdir(self._model_dir)
        command = "java -jar %s -c %s -i %s -o %s -m parse" % (
            self._parser_bin, self._model_name,
            tagged_file_name, output_file_name)
        os.system(command)
        os.chdir(original_dir)

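# --- Illustrative usage sketch (not part of the original module) ---
# Hypothetical paths; adjust to the local MaltParser installation:
#
#   malt = MaltParser("/opt/maltparser/maltparser.jar", "/opt/models", "es_model")
#   graphs = malt.parse(tagged_sentences)
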
class ZparParser(Parser):
    """
    ZPar wrapper
    """

    def __init__(self, parser_bin, model_path):
        """
        Constructor
        @param parser_bin: Path to the ZPar dependency parser binary
        @param model_path: Path to the ZPar model
        """
        self._parser_bin = parser_bin
        self._model_path = model_path
        self._preparator = TextPreparator()

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        """
        Runs the ZPar binary on a tagged file and writes the parsed output.
        """
        os.system("%s -c %s %s %s" % (self._parser_bin, tagged_file_name, output_file_name, self._model_path))
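
# --- Illustrative usage sketch (not part of the original module) ---
# Hypothetical paths; adjust to the local ZPar installation:
#
#   zpar = ZparParser("/opt/zpar/dist/zpar.es", "/opt/zpar/models/es")
#   graphs = zpar.parse(tagged_sentences)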