Package miopia :: Package tagger :: Module AbstractTagger
[hide private]
[frames | no frames]

Source Code for Module miopia.tagger.AbstractTagger

 1  #-*- coding: utf-8 -*-  
 2  ''' 
 3  @author: David Vilares Calvo 
 4  ''' 
 5   
 6  import pickle 
 7   
class AbstractTagger(object):
    """
    An abstract wrapper to build NLTK-based taggers.

    Provides helpers to strip accents from a string and to read
    (word, tag) sentences from tab-separated CoNLL-X style files.
    """

    # Lowercase acute-accented vowels and their unaccented counterparts.
    # Plain ``replace`` calls are used (rather than a translation table) so
    # the method keeps working on whatever string type the caller passes in.
    _ACCENT_REPLACEMENTS = (
        ('á', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ó', 'o'),
        ('ú', 'u'),
    )

    def dis_accentuate(self, l):
        '''
        Replace the lowercase accented vowels á, é, í, ó, ú by their
        unaccented form. Any other character is left untouched.

        @param l: A string.
        @return: l without those accents.
        '''
        for accented, plain in self._ACCENT_REPLACEMENTS:
            l = l.replace(accented, plain)
        return l

    def _read_tagged_sentences(self, path, finetag):
        '''
        Read a tab-separated file and group its rows into sentences.

        A line with at most one column (typically a blank line) closes the
        current sentence. Every other line contributes the tuple
        (column 1, column 4) if finetag is true, (column 1, column 3)
        otherwise (0-based column indexes).

        @param path: Path of the file to read.
        @param finetag: If true, use the fine tag column, else the coarse one.
        @return: A list of sentences, each one a list of (word, tag) tuples.
        @raise IndexError: If a non-separator line has too few columns.
        '''
        tag_column = 4 if finetag else 3
        sentences = []
        current = []
        # The context manager guarantees the file is closed, even on error.
        with open(path, 'r') as conll_file:
            for line in conll_file:
                # Drop the line terminator so that a tag stored in the last
                # column does not carry a trailing newline.
                columns = line.rstrip('\r\n').split('\t')
                if len(columns) <= 1:
                    # Sentence separator: every separator line closes the
                    # sentence collected so far (even if it is empty).
                    sentences.append(current)
                    current = []
                else:
                    current.append((columns[1], columns[tag_column]))
        # Do not lose the last sentence when the file does not end with a
        # separator line.
        if current:
            sentences.append(current)
        return sentences

    def get_training_set(self, train_file, finetag=True):
        '''
        @precondition: train_file had to be created previously
        @param finetag: If true, get_training_set will return fine tags,
                        not coarse tags
        @param train_file: A path to a training set in CoNLL-X format
        @return: A list of sentences, each one a list of (word,tag)
        '''
        return self._read_tagged_sentences(train_file, finetag)

    def get_gold_set(self, gold_file, finetag=True):
        '''
        @precondition: gold_file had to be created previously
        @param finetag: If true, get_gold_set will return fine tags,
                        not coarse tags
        @param gold_file: A path to a test set in CoNLL-X format
        @return: A list of sentences, each one a list of (word,tag)
        '''
        return self._read_tagged_sentences(gold_file, finetag)