#-*- coding: utf-8 -*- 
'''
@author: David Vilares Calvo
'''

import pickle

class AbstractTagger(object):
    """
    An abstract wrapper to build NLTK-based taggers
    """

    # (accented, plain) pairs handled by dis_accentuate. Only lowercase
    # acute-accented vowels are covered.
    _ACCENT_REPLACEMENTS = (
        ('á', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ó', 'o'),
        ('ú', 'u'),
    )

    def dis_accentuate(self,l):
        '''
        Replace the lowercase acute-accented vowels in a string by their
        plain counterparts.

        @param l: A string.
        @return: l with 'á', 'é', 'í', 'ó' and 'ú' replaced by
        'a', 'e', 'i', 'o' and 'u'. Any other character is left untouched.
        '''
        for accented, plain in self._ACCENT_REPLACEMENTS:
            l = l.replace(accented, plain)
        return l

    def _read_tagged_sentences(self, conll_file, finetag=True):
        '''
        Read a tab-separated, one-token-per-line file into tagged sentences.

        A line with at most one tab-separated field (e.g. a blank line)
        closes the current sentence. Consecutive separator lines therefore
        yield empty sentences. A non-empty sentence still open at the end
        of the file is also returned, so a missing final blank line does
        not drop the last sentence.

        @param conll_file: A path to a file in CoNLL-X format
        @param finetag: If true the tag is read from the fifth column,
        otherwise from the fourth one
        @return: A list of sentences, each one a list of (word,tag)
        @raise IndexError: If a token line has fewer columns than the
        selected tag column requires
        '''
        # Column 1 holds the word form; columns 3 and 4 hold the coarse and
        # the fine tag respectively.
        tag_column = 4 if finetag else 3
        sentences = []
        current = []
        # The context manager guarantees the file is closed, even on error.
        with open(conll_file, 'r') as handle:
            for line in handle:
                columns = line.split('\t')
                if len(columns) <= 1:
                    sentences.append(current)
                    current = []
                else:
                    current.append((columns[1], columns[tag_column]))
        if current:
            sentences.append(current)
        return sentences

    def get_training_set(self,train_file, finetag=True):
        '''
        @precondition: train_file had to be created previously
        @param finetag: If true, get_training_set will return fine tags, not coarse tags
        @param train_file:  A path to a training set in CoNLL-X format
        @return: A list of sentences, each one a list of (word,tag)
        '''
        return self._read_tagged_sentences(train_file, finetag)

    def get_gold_set(self,gold_file, finetag=True):
        '''
        @precondition: gold_file had to be created previously
        @param finetag: If true, get_gold_set will return fine tags, not coarse tags
        @param gold_file: A path to a test set in CoNLL-X format
        @return: A list of sentences, each one a list of (word,tag)
        '''
        return self._read_tagged_sentences(gold_file, finetag)