1
2 '''
3 @author: David Vilares Calvo
4 '''
5 import codecs
6
class TextPreparator(object):
    """
    Tools for preparing a text to MaltParser in CoNLL 2006 format.

    The class keeps no instance state; the default no-argument
    constructor is sufficient.
    """

    def prepare(self, output_path, tagged_sentences):
        """
        Write the tagged sentences to an (unparsed) CoNLL file.

        Each sentence is written as one line per token and is followed by
        an empty line, which acts as the sentence separator.

        @param output_path: A path to the output conll file unparsed
        @param tagged_sentences: A list of lists of tuples
        (word,L{TokenDependencyInfo}). Each list of tuples is a tagged sentence.
        """
        # The context manager guarantees the file is flushed and closed even
        # if formatting one of the sentences raises.
        with codecs.open(output_path, 'w', encoding='utf-8') as output_file:
            for tagged_sentence in tagged_sentences:
                output_file.writelines(self._toConll2006(tagged_sentence))
                output_file.write('\n')

    def _toConll2006(self, taggedTokens):
        """
        Format one tagged sentence as CoNLL 2006 lines.

        Every line holds six tab-separated columns: the 1-based token
        identifier, the word form, a '_' placeholder, and the values returned
        by get_cpostag(), get_postag() and get_feats() of the token info.

        @param taggedTokens: A list of tuples (word,L{TokenDependencyInfo}) of a sentence
        @return: A String list in CoNLL 2006 format
        """
        conll_lines = []
        # Token identifiers start at 1 within each sentence.
        for identifier, token in enumerate(taggedTokens, 1):
            word, info = token[0], token[1]
            columns = [str(identifier), word, '_', info.get_cpostag(),
                       info.get_postag(), info.get_feats()]
            conll_lines.append('\t'.join(columns) + '\n')
        return conll_lines
43