'''
Created on 25/02/2014

@author: david.vilares
'''
from miopia.analyzer.counter.RawCounter import RawCounter
from miopia.util.TokenInfo import TokenInfo
from nltk.util import ngrams


import itertools

class NGramCounter(RawCounter):
    '''
    Counter that turns the tokens of each text into n-gram features.

    The n-gram size and the per-position back-off specification are read
    from the L{FeatureTypeConfiguration} given at construction time. Every
    token of an n-gram is backed off through the L{BackOff} instance, and one
    feature is counted for every combination of the backed-off values.
    '''

    def __init__(self, ftc, preprocessor, lexical_processor, back_off, stop_words=None,
                 lowercase=True):
        '''
        @param ftc: An instance of L{FeatureTypeConfiguration}
        @param preprocessor: An instance of L{PreprocessorI}
        @param lexical_processor: An instance of L{LexicalProcessor}
        @param back_off: An instance of L{BackOff}
        @param stop_words: A set of words that shouldn't be taken into account.
        None (the default) means no stop words.
        @param lowercase: A boolean. True to ignore capitalised characters.
        '''
        super(NGramCounter, self).__init__(ftc, lowercase)
        self._ngram_size = int(ftc.get_n_gram())
        self._preprocessor = preprocessor
        self._lexical_processor = lexical_processor
        self._back_off = back_off
        # A fresh set per instance: a mutable default argument would be
        # shared by every NGramCounter built without explicit stop words.
        self._stop_words = set() if stop_words is None else stop_words

    def _count(self, list_text_info):
        """
        @param list_text_info: A list of L{TextInfo} objects
        @return: A dictionary mapping each feature identifier (as built by
        C{_id_of_feature} from the text id, the n-gram position and the
        backed-off n-gram string) to the number of times it was seen, as a float
        """
        dict_features = {}
        tags = self.raw_processing(list_text_info)
        # The back-off specification does not depend on the text or the gram.
        type_back_off = self._ftc.get_n_gram_back_off()

        for text_id, list_sentences_word_tag in tags:
            # Flatten the sentences of the text into one token sequence, so
            # n-grams may span sentence boundaries. Words that are exactly
            # a stop word are dropped before the n-grams are built.
            list_token_info = []
            for sentence_word_tag in list_sentences_word_tag:
                list_token_info.extend(
                    TokenInfo(self._filter(word), None,
                              infotag.get_cpostag(), infotag.get_postag())
                    for word, infotag in sentence_word_tag
                    if word not in self._stop_words)

            # 'position' is the index of the n-gram inside the text; every
            # back-off variant of the same n-gram shares that position.
            for position, gram in enumerate(ngrams(list_token_info, self._ngram_size)):
                for back_off_gram in self._back_off_n_grams(gram, type_back_off):
                    feature_id = self._id_of_feature(text_id, position, back_off_gram)
                    dict_features[feature_id] = dict_features.get(feature_id, 0.) + 1.
        return dict_features

    def _filter(self, string):
        """
        Normalises the form of a token.

        The first stop word (in the iteration order of the stop word
        collection) that occurs inside the string is removed from it; at most
        one stop word is removed. The result is lowercased when the counter
        was configured with lowercase=True.

        @param string: The form of the token
        @return: The normalised string
        """
        for stop_word in self._stop_words:
            if stop_word in string:
                string = string.replace(stop_word, "")
                break
        # Lowercasing is applied on every path, including after a stop word
        # has been stripped, so that 'lowercase' is honoured consistently.
        return string.lower() if self._lowercase else string

    def _back_off_n_grams(self, gram, type_back_off):
        """
        @param gram: A tuple where each element is an element of the gram
        @param type_back_off: A L{FeatureLevelBackOff}. It is split by
        NGRAM_BACK_OFF_DELIMITER to obtain the back-off applied to each
        position of the gram.
        @return: A list of strings, one per combination of the backed-off
        values of the tokens, with the values joined by '_'
        """
        back_off_types = type_back_off.split(self._back_off.NGRAM_BACK_OFF_DELIMITER)
        back_off_gram_combinations = []
        for token_info, back_off in zip(gram, back_off_types):
            # Evaluate the back-off once per token; it may yield either a
            # single value or a list of alternative values.
            backed_off = self._back_off.back_off(token_info, back_off)
            if not isinstance(backed_off, list):
                backed_off = [backed_off]
            back_off_gram_combinations.append(backed_off)
        return ['_'.join(back_off_combination)
                for back_off_combination in itertools.product(*back_off_gram_combinations)]
        
            
    
