1
2 '''
3 @author: David Vilares Calvo
4 '''
5
6 import pickle
7
9 """
10 An abstract wrapper to build NLTK-based taggers
11 """
12
# NOTE(review): the `def` line of this routine is missing from the visible
# source; the name below is reconstructed -- confirm against callers. The
# parameter name `l` comes from the original docstring and body.
def remove_accents(l):
    '''
    Replace lowercase acute-accented vowels by their plain counterparts.

    Only the five characters á, é, í, ó and ú are rewritten; every other
    character (uppercase accented vowels, ñ, ü, ...) is left untouched.

    @param l: A string.
    @return: l with á, é, í, ó, ú replaced by a, e, i, o, u.
    '''
    # str.replace is already a no-op when the character is absent, so no
    # membership pre-check is needed (it would only scan the string twice).
    replacements = (
        ('á', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ó', 'o'),
        ('ú', 'u'),
    )
    for accented, plain in replacements:
        l = l.replace(accented, plain)
    return l
30
31
# NOTE(review): the `def` line of this routine is missing from the visible
# source. The name and parameter names are taken from the original docstring;
# the parameter order (and the absence of `self`) is reconstructed -- confirm
# against callers.
def get_training_set(train_file, finetag):
    '''
    Read a tagged training corpus and group its tokens by sentence.

    Every line is split on tabs. A line that yields at most one column
    (e.g. a blank line) closes the current sentence; any other line
    contributes one (word, tag) pair, where the word is column 1 and the tag
    is column 4 (fine) or column 3 (coarse), counting from 0.

    @precondition: train_file must already exist.
    @param train_file: A path to a training set in CoNLL-X format.
    @param finetag: If true, fine tags are returned instead of coarse tags.
    @return: A list of sentences, each one a list of (word, tag) tuples.
    @raise IndexError: If a tab-separated line has too few columns for the
                       requested tag.
    '''
    tag_column = 4 if finetag else 3
    sentences = []
    current_sentence = []

    # The context manager guarantees the file is closed, even on error.
    with open(train_file, 'r') as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) <= 1:
                # Sentence separator: flush what has been collected so far
                # (kept as in the original, even when it is empty).
                sentences.append(current_sentence)
                current_sentence = []
            else:
                current_sentence.append((columns[1], columns[tag_column]))

    # A file that does not end with a separator line would otherwise lose
    # its last sentence.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences
62
63
# NOTE(review): the `def` line of this routine is missing from the visible
# source. The parameter names come from the original docstring and body; the
# function name, the parameter order and the absence of `self` are
# reconstructed -- confirm against callers.
def get_gold_set(gold_file, finetag):
    '''
    Read a tagged gold (test) corpus and group its tokens by sentence.

    Every line is split on tabs. A line that yields at most one column
    (e.g. a blank line) closes the current sentence; any other line
    contributes one (word, tag) pair, where the word is column 1 and the tag
    is column 4 (fine) or column 3 (coarse), counting from 0.

    @precondition: gold_file must already exist.
    @param gold_file: A path to a test set in CoNLL-X format.
    @param finetag: If true, fine tags are returned instead of coarse tags.
    @return: A list of sentences, each one a list of (word, tag) tuples.
    @raise IndexError: If a tab-separated line has too few columns for the
                       requested tag.
    '''
    tag_column = 4 if finetag else 3
    gold = []
    tagged_sentence = []

    # The context manager guarantees the file is closed, even on error.
    with open(gold_file, 'r') as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) <= 1:
                # Sentence separator: flush what has been collected so far
                # (kept as in the original, even when it is empty).
                gold.append(tagged_sentence)
                tagged_sentence = []
            else:
                tagged_sentence.append((columns[1], columns[tag_column]))

    # A file that does not end with a separator line would otherwise lose
    # its last sentence.
    if tagged_sentence:
        gold.append(tagged_sentence)
    return gold
89