'''
@author: David Vilares Calvo
'''

import os
import codecs
import tempfile
from miopia.util.ConfigurationManager import ConfigurationManager
from miopia.parser.SentimentDependencyGraph import SentimentDependencyGraph
from miopia.preparator.TextPreparator import TextPreparator
from miopia.parser.TokenDependencyInfo import TokenDependencyInfo


# NOTE: the original class definition line is missing from this file; the name
# "Parser" is an assumption so that the methods below have an enclosing class.
class Parser(object):
    """
    Tools for interacting with MaltParser 1.7* and obtaining L{SentimentDependencyGraph}
    """

    __lang = 'es'

    # NOTE: the original method signature is missing; "parse_file" is an assumed name.
    def parse_file(self, file_path):
        """
        @param file_path: A path to a CoNLL 2006 file.
        @return: A list of L{SentimentDependencyGraph} representing the parsed file.
        """
        sentences = self._simplify(file_path)
        return [SentimentDependencyGraph(s) for s in sentences]

    def parse_dir_to_file(self, dir_path, list_of_tagged_sentences,
                          input_path="/tmp/parse_dir_to_file_unparsed.conll",
                          output_path="/tmp/parse_dir_to_file_parsed.conll"):
        """
        Parses a whole directory of plain texts into a single file.
        @param dir_path: The directory where the parsed sentences of
        list_of_tagged_sentences will be written, one file per text_id, in CoNLL 2006 format.
        @param list_of_tagged_sentences: A list of (text_id, [[(token, L{INfoTag})]]) tuples.
        @param input_path: The path to the file where all plain sentences are written
        in CoNLL 2006 format before parsing them.
        @param output_path: The path to the single file holding every parsed sentence.
        """
        list_id_and_number_of_sentences = []
        dir_tagged_sentences = []
        for text_id, tagged_sentences in list_of_tagged_sentences:
            if tagged_sentences == [[]]:
                list_id_and_number_of_sentences.append((text_id, 0))
            else:
                list_id_and_number_of_sentences.append((text_id, len(tagged_sentences)))
                dir_tagged_sentences.extend(tagged_sentences)
        self.parse_to_file(output_path, dir_tagged_sentences,
                           input_path)
        with open(output_path) as parsed_file:
            sentences = parsed_file.read().split('\n\n')
        i = 0
        for text_id, number_sentences in list_id_and_number_of_sentences:
            with open(dir_path + os.sep + text_id, "w") as output_file:
                if number_sentences == 0:
                    output_file.write('')
                else:
                    output_file.write('\n\n'.join(sentences[i:i + number_sentences]) + '\n\n')
            i += number_sentences

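    # Usage sketch (hypothetical ids and tags; the tagged input would normally be
    # produced by L{LexicalProcessor}):
    #
    #   tagged = [("review_1.txt", [[("Esta", t1), ("bien", t2)]]),
    #             ("review_2.txt", [[]])]
    #   parser.parse_dir_to_file("/tmp/parsed_reviews", tagged)
    #
    # Each text_id becomes a file under dir_path holding its sentences in CoNLL 2006
    # format; texts with no sentences produce an empty file.
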
    def parse_to_file(self, output_path, tagged_sentences, aux_path=None):
        """
        @param output_path: The destination file.
        @param tagged_sentences: [tagged_sentence], where each tagged_sentence is a
        [(token, L{INfoTag})]. Use L{LexicalProcessor} to obtain them.
        @param aux_path: The path to an auxiliary file used to parse the sentences.
        """
        if aux_path is None:
            aux_path = tempfile.NamedTemporaryFile(delete=False).name
        self._preparator.prepare(aux_path, tagged_sentences)
        self.parse_tagged_file(aux_path, output_path)
        os.unlink(aux_path)

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        # Signature inferred from the calls in parse_to_file/parse; runs the
        # MaltParser model configured through ConfigurationManager.
        original_dir = os.getcwd()
        c = ConfigurationManager(lang=self.__lang)
        os.chdir(c.getParameter("path_maltparser_model"))
        os.system("java -jar " + c.getParameter("path_maltparser") + " -c " + c.getParameter("maltparser_model") +
                  " -i " + tagged_file_name + " -o " + output_file_name + " -m parse")
        os.chdir(original_dir)

    def parse_dir(self, list_of_tagged_sentences):
        """
        Parses the tagged sentences of several texts in a single batch.
        @param list_of_tagged_sentences: A list of (text_id, [[(token, L{INfoTag})]]) tuples.
        @return: A list of (text_id, [L{SentimentDependencyGraph}]) tuples.
        """
        list_id_and_number_of_sentences = []
        list_id_and_dependency_graphs = []
        dir_tagged_sentences = []
        for text_id, tagged_sentences in list_of_tagged_sentences:
            list_id_and_number_of_sentences.append((text_id, len(tagged_sentences)))
            dir_tagged_sentences.extend(tagged_sentences)

        graphs = self.parse(dir_tagged_sentences)
        i = 0
        for text_id, number_sentences in list_id_and_number_of_sentences:
            list_id_and_dependency_graphs.append((text_id, graphs[i:i + number_sentences]))
            i += number_sentences
        return list_id_and_dependency_graphs

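    # Usage sketch (hypothetical ids and tags):
    #
    #   tagged = [("doc_1", [[("Me", t1), ("gusta", t2)]]),
    #             ("doc_2", [[("No", t3), ("me", t4), ("gusta", t5)]])]
    #   for text_id, graphs in parser.parse_dir(tagged):
    #       ...  # one SentimentDependencyGraph per sentence of text_id
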
    def parse(self, tagged_sentences, temp_input=None, temp_output=None):
        """
        @param tagged_sentences: [tagged_sentence], where each tagged_sentence is a
        [(token, L{INfoTag})]. Use L{LexicalProcessor} to obtain them.
        @param temp_input: Temporary file to save the unparsed text.
        @param temp_output: Temporary file to save the parsed text.
        @return: A [L{SentimentDependencyGraph}]
        """
        if temp_input is None:
            temp_input = tempfile.NamedTemporaryFile(delete=False).name

        if temp_output is None:
            temp_output = tempfile.NamedTemporaryFile(delete=False).name

        self._preparator.prepare(temp_input, tagged_sentences)
        self.parse_tagged_file(temp_input, temp_output)
        sentences = self._simplify(temp_output)

        os.unlink(temp_input)
        os.unlink(temp_output)
        return [SentimentDependencyGraph(s) for s in sentences]

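    # Usage sketch (hypothetical tags; requires a trained model, configured either
    # through ConfigurationManager or one of the wrapper classes below):
    #
    #   graphs = parser.parse([[("Me", t1), ("gusta", t2), (".", t3)]])
    #   # graphs is a [SentimentDependencyGraph], one graph per input sentence
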
    def _simplify(self, parsed_file):
        """
        Simplifies a CoNLL 2006 file. The output is used to build instances of
        L{SentimentDependencyGraph}.
        @param parsed_file: A path to a CoNLL 2006 file.
        @return: A list of dictionaries. Each dictionary stores one sentence of the file:
        ID is the key and the string FORM\tPOSTAG\tHEAD\tDEPREL is the value.
        """
        co = codecs.open(parsed_file, encoding="utf-8")
        lines = co.readlines()
        sentence = {}
        sentences = []

        for l in lines:
            if len(l) > 1:
                columns = l.split('\t')
                t = TokenDependencyInfo(columns[1], columns[4], int(columns[6]), columns[7])
                sentence[int(columns[0])] = t
            else:
                sentences.append(sentence)
                sentence = {}
        co.close()
        if sentence != {}:
            sentences.append(sentence)

        return self._format(self._reorganize(sentences))

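    # For reference, a CoNLL 2006 line has 10 tab-separated columns
    # (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL);
    # _simplify keeps only FORM, POSTAG, HEAD and DEPREL (columns 1, 4, 6 and 7).
    # For an illustrative line such as
    #   3	gusta	gustar	v	vmip3s0	_	1	S	_	_
    # it stores TokenDependencyInfo("gusta", "vmip3s0", 1, "S") under key 3.
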
    def _right_brothers(self, sentence, identifier):
        """
        @param sentence: An adversative sentence.
        @param identifier: ID of the adversative clause.
        @return: A list with the IDs of the right brothers (siblings) of the adversative clause.
        """
        brothers = []
        father = sentence[identifier].get_head()

        for key in sentence.keys():
            if sentence[key].get_head() == father and key >= identifier:
                brothers.append(key)
        return brothers

    def _reorganize(self, sentences):
        """
        Reorganizes the parsed CoNLL 2006 output to simplify the subordinating sentences.
        @param sentences: A list of dictionaries. Each dictionary is a sentence in CoNLL 2006
        representation: ID is the key and a L{TokenDependencyInfo} is the value.
        """
        for sentence in sentences:
            for key in sentence.keys():
                if self._is_symbolic_url(sentence[key]):
                    sentence = self._reorganize_symbolic_url(sentence, key)
                if self._is_emoticon(sentence[key]):
                    sentence = self._reorganize_emoticon(sentence, key)
                if self._is_reorganizable_adversative(sentence[key]):
                    sentence = self._reorganize_adversative(sentence, key)
        return sentences

    def _is_symbolic_url(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance.
        @return: True if the token form equals 'SymbolicURL', False otherwise.
        """
        return token.get_form() == 'SymbolicURL'

    def _reorganize_symbolic_url(self, sentence, key):
        """
        @precondition: The L{TokenDependencyInfo} sentence[key] must be a symbolic URL.
        @param sentence: A dictionary of L{TokenDependencyInfo}. Represents a sentence
        in CoNLL 2006; the ID column is the key.
        @param key: ID of the symbolic URL token.
        @return: The dictionary with updated information for the symbolic URL token.
        """
        sentence[key].set_deprel("art_rel_symbolicurl")
        sentence[key].set_finetag("symbolicurl:")
        return sentence

    def _is_emoticon(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance.
        @return: True if the token form is in the set (['Emoticon-Negative', 'Emoticon-Positive',
        'Extremely-Emoticon-Positive', 'Extremely-Emoticon-Negative', 'Neutral']),
        False otherwise.
        """
        set_emoticons = set(['Emoticon-Negative', 'Emoticon-Positive',
                             'Extremely-Emoticon-Positive',
                             'Extremely-Emoticon-Negative',
                             'Neutral'])

        return token.get_form() in set_emoticons

    def _reorganize_emoticon(self, sentence, key):
        """
        @precondition: The L{TokenDependencyInfo} sentence[key] must be an emoticon.
        @param sentence: A dictionary of L{TokenDependencyInfo}. Represents a sentence
        in CoNLL 2006; the ID column is the key.
        @param key: ID of the emoticon token.
        @return: The dictionary with updated information for the emoticon token.
        """
        sentence[key].set_deprel("art_rel_emoticon")
        sentence[key].set_finetag("emoticon:")
        return sentence

    def _is_reorganizable_adversative(self, token):
        """
        @param token: A L{TokenDependencyInfo} instance.
        @return: True if the token can be reorganized ('pero', 'sino', 'mientras',
        'mientras_que', 'sino_que'), False otherwise.
        """
        ladversatives = ['pero', 'sino', 'mientras', 'mientras_que', 'sino_que']

        return (token.get_finetag() == "c:postype_coordinating"
                and (token.get_form() in ladversatives)
                and token.get_head() != 0 and token.get_deprel() == "coord")

    def _reorganize_adversative(self, sentence, key):
        """
        @precondition: The adversative clause must be reorganizable.
        @param sentence: A dictionary of a sentence in CoNLL 2006: ID is the key and
        a L{TokenDependencyInfo} is the value.
        @param key: ID of an adversative clause.
        @return: The reorganized sentence dictionary.
        """
        head = sentence[key].get_head()
        artificial_id = len(sentence) + 1
        form = sentence[key].get_form()
        artificial_node = TokenDependencyInfo("[]", "art_adversative:" + self._type_of_adversative(form) + "@" + str(key),
                                              sentence[head].get_head(), "art_rel_adversative")
        sentence[artificial_id] = artificial_node
        sentence[head].set_head(artificial_id)
        right_brothers = self._right_brothers(sentence, key)
        for brother in right_brothers:
            sentence[brother].set_head(artificial_id)
        return sentence

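    # Illustration (hypothetical tree): in "es bonito pero caro", if "pero" (key k)
    # hangs from its head with deprel "coord", an artificial node
    # TokenDependencyInfo("[]", "art_adversative:restrict@k", <head of that head>,
    # "art_rel_adversative") is appended, and both the former head and the right
    # brothers of "pero" are re-attached to it.
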
    def _type_of_adversative(self, form):
        """
        @precondition: form must be in {'pero', 'sino', 'mientras', 'mientras_que', 'sino_que'}.
        @param form: The form of an adversative conjunction.
        @return: 'restrict' if the clause is restrictive, 'exclude' otherwise.
        """
        if form in ['pero', 'mientras', 'mientras_que']:
            return 'restrict'
        else:
            return 'exclude'


# NOTE: the original class definition line is missing; the name "MaltParser" and the
# inheritance from Parser are inferred from the docstring and the methods it relies on.
class MaltParser(Parser):
    """
    MaltParser Wrapper
    """
    def __init__(self, parser_bin, model_dir, model_name):
        """
        Constructor
        """
        self._preparator = TextPreparator()
        self._parser_bin = parser_bin
        self._model_dir = model_dir
        self._model_name = model_name

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        # Signature inferred from the base class; runs MaltParser with the
        # constructor-provided jar and model.
        original_dir = os.getcwd()
        os.chdir(self._model_dir)
        command = "java -jar %s -c %s -i %s -o %s -m parse" % (
            self._parser_bin, self._model_name,
            tagged_file_name, output_file_name)
        os.system(command)
        os.chdir(original_dir)


# NOTE: the original class definition line is missing; the name "Zpar" and the
# inheritance from Parser are inferred from the docstring.
class Zpar(Parser):
    """
    ZPar wrapper
    """
    def __init__(self, parser_bin, model_path):
        """
        Constructor
        """
        self._parser_bin = parser_bin
        self._model_path = model_path
        self._preparator = TextPreparator()

    def parse_tagged_file(self, tagged_file_name, output_file_name):
        # Signature inferred from the base class; runs the ZPar binary with the
        # constructor-provided model.
        os.system("%s -c %s %s %s" % (self._parser_bin, tagged_file_name, output_file_name, self._model_path))
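

# Minimal usage sketch, assuming the wrapper class name reconstructed above.
# The jar/model paths are placeholders (adjust them to your installation), and the
# tagged input would normally come from L{LexicalProcessor}.
if __name__ == '__main__':
    malt = MaltParser(parser_bin="/path/to/maltparser.jar",
                      model_dir="/path/to/models",
                      model_name="spanish_model")
    # graphs = malt.parse(tagged_sentences)  # tagged_sentences: [[(token, INfoTag)]]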