
Source Code for Module miopia.analyzer.counter.NGramCounter

'''
Created on 25/02/2014

@author: david.vilares
'''
from miopia.analyzer.counter.RawCounter import RawCounter
from miopia.util.TokenInfo import TokenInfo
from nltk.util import ngrams

import itertools

class NGramCounter(RawCounter):
    '''
    Counts n-gram features over preprocessed texts, optionally generalising
    each element of a gram through the configured back-off strategy.
    '''

    def __init__(self, ftc, preprocessor, lexical_processor, back_off,
                 stop_words=frozenset(), lowercase=True):
        '''
        @param ftc: An instance of L{FeatureTypeConfiguration}
        @param preprocessor: An instance of L{PreprocessorI}
        @param lexical_processor: An instance of L{LexicalProcessor}
        @param back_off: An instance of L{BackOff}
        @param stop_words: A set of words that shouldn't be taken into account
        @param lowercase: A boolean. True to lowercase words before counting.
        '''
        super(NGramCounter, self).__init__(ftc, lowercase)
        self._ngram_size = int(ftc.get_n_gram())
        self._preprocessor = preprocessor
        self._lexical_processor = lexical_processor
        self._back_off = back_off
        self._stop_words = stop_words
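
    # A hedged usage sketch, not part of the original module: the collaborator
    # objects are assumed to implement the interfaces named in the docstring
    # above (FeatureTypeConfiguration must at least provide get_n_gram() and
    # get_n_gram_back_off()); how they are built is purely illustrative.
    #
    #   counter = NGramCounter(ftc, preprocessor, lexical_processor, back_off,
    #                          stop_words=set(["the", "a"]), lowercase=True)
    #   features = counter._count(list_text_info)  # {feature_id: count, ...}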

    def _count(self, list_text_info):
        """
        Counts the (backed-off) n-gram features of each text.

        @param list_text_info: A list of L{TextInfo} objects
        @return: A dictionary mapping feature identifiers to their counts
        """
        dict_features = {}
        tags = self.raw_processing(list_text_info)

        # Flatten each text into a single list of TokenInfo objects,
        # discarding stop words.
        list_textid_list_token_info = []
        for textid, list_sentences_word_tag in tags:
            l_tokeninfo_text = []
            for sentence_word_tag in list_sentences_word_tag:
                l_tokeninfo_text.extend(
                    [TokenInfo(self._filter(word), None,
                               infotag.get_cpostag(), infotag.get_postag())
                     for word, infotag in sentence_word_tag
                     if word not in self._stop_words])
            list_textid_list_token_info.append((textid, l_tokeninfo_text))

        # Extract the n-grams of each text, expand every gram into its
        # back-off variants and count the resulting features.
        for text_id, list_token_info in list_textid_list_token_info:
            text_grams = [self._back_off_n_grams(gram, self._ftc.get_n_gram_back_off())
                          for gram in ngrams(list_token_info, self._ngram_size)]
            position = 0
            for gram_list in text_grams:
                for gram in gram_list:
                    feature_id = self._id_of_feature(text_id, position, gram)
                    dict_features[feature_id] = dict_features.get(feature_id, 0.) + 1.
                position += 1
        return dict_features
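
    # nltk.util.ngrams, the only external helper used by _count, slides a
    # window of size n over the token sequence.  A minimal illustration with
    # plain strings in place of TokenInfo objects:
    #
    #   >>> from nltk.util import ngrams
    #   >>> list(ngrams(['not', 'very', 'good'], 2))
    #   [('not', 'very'), ('very', 'good')]
    #
    # _count applies the same windowing to TokenInfo objects and then expands
    # each window through _back_off_n_grams before counting.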

    def _filter(self, string):
        # Remove every stop word that occurs as a substring of the token.
        for stop_word in self._stop_words:
            if stop_word in string:
                string = string.replace(stop_word, "")
        return string.lower() if self._lowercase else string
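
    # Note that _filter removes stop words occurring as substrings of the
    # token, not only exact matches.  A hedged behaviour sketch, assuming a
    # counter built with stop_words=set(['very']) and lowercase=True:
    #
    #   >>> counter._filter('veryGood')
    #   'good'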

    def _back_off_n_grams(self, gram, type_back_off):
        """
        @param gram: A tuple where each element is an element of the gram
        @param type_back_off: A L{FeatureLevelBackOff}
        @return: The list of feature strings obtained by combining the
                 back-off alternatives of every element of the gram
        """
        # back_off() may return a single value or a list of alternatives;
        # normalise everything to lists so the Cartesian product below
        # works uniformly (and back_off() is called only once per token).
        back_off_gram_combinations = []
        for token_info, back_off in zip(gram, type_back_off.split(self._back_off.NGRAM_BACK_OFF_DELIMITER)):
            backed_off = self._back_off.back_off(token_info, back_off)
            if not isinstance(backed_off, list):
                backed_off = [backed_off]
            back_off_gram_combinations.append(backed_off)
        list_back_off_combinations = itertools.product(*back_off_gram_combinations)
        return ['_'.join(back_off_combination)
                for back_off_combination in list_back_off_combinations]
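
# The expansion in _back_off_n_grams is a plain Cartesian product over the
# back-off alternatives of each gram element.  A self-contained sketch with
# made-up alternatives (a first token backing off to its word form or its
# POS tag, and a second token with a single form):
#
#   >>> import itertools
#   >>> ['_'.join(c) for c in itertools.product(['good', 'ADJ'], ['movie'])]
#   ['good_movie', 'ADJ_movie']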