1 '''
2 Created on 25/02/2014
3
4 @author: david.vilares
5 '''
6 from miopia.analyzer.counter.RawCounter import RawCounter
7 from miopia.util.TokenInfo import TokenInfo
8 from nltk.util import ngrams
9
10
11 import itertools
12
'''
Counter that builds n-gram feature counts from texts: tokens are
stop-word-filtered and optionally lowercased, then each n-gram is
expanded through the configured back-off before being counted.
'''
17
def __init__(self, ftc, preprocessor, lexical_processor, back_off, stop_words=None,
             lowercase=True):
    '''
    Initialise the n-gram counter.

    @param ftc: An instance of L{FeatureTypeConfiguration}
    @param preprocessor: An instance of L{PreprocessorI}
    @param lexical_processor: An instance of L{LexicalProcessor}
    @param back_off: An instance of L{BackOff}
    @param stop_words: A set of words that shouldn't be taken into account
        (defaults to an empty set)
    @param lowercase: A boolean. True to ignore capitalised characters.
    '''
    super(NGramCounter, self).__init__(ftc, lowercase)
    self._ngram_size = int(ftc.get_n_gram())
    self._preprocessor = preprocessor
    self._lexical_processor = lexical_processor
    self._back_off = back_off
    # Bug fix: the old default was the mutable `set([])`, which is created
    # once and shared by every call; build a fresh set per instance instead.
    self._stop_words = set() if stop_words is None else stop_words
34
35
def _count(self, list_text_info):
    """
    Count back-off n-gram features across a collection of texts.

    @param list_text_info: A list of L{TextInfo} objects
    @return: A dict mapping feature identifiers (built by
        self._id_of_feature) to float occurrence counts.
    """
    dict_features = {}
    tags = self.raw_processing(list_text_info)

    # Flatten each text's sentences into a single list of TokenInfo,
    # dropping stop words and normalising each word through _filter.
    list_textid_list_token_info = []
    for textid, list_sentences_word_tag in tags:
        l_tokeninfo_text = []
        for sentence_word_tag in list_sentences_word_tag:
            l_tokeninfo_text.extend(
                [TokenInfo(self._filter(word), None,
                           infotag.get_cpostag(), infotag.get_postag())
                 for word, infotag in sentence_word_tag
                 if word not in self._stop_words])
        list_textid_list_token_info.append((textid, l_tokeninfo_text))

    for text_id, list_token_info in list_textid_list_token_info:
        # Each n-gram is expanded into one or more back-off variants.
        text_grams = [self._back_off_n_grams(gram, self._ftc.get_n_gram_back_off())
                      for gram in ngrams(list_token_info, self._ngram_size)]

        position = 0
        for gram_list in text_grams:
            for gram in gram_list:
                # Compute the feature id once; the old try/except KeyError
                # recomputed it on the miss path.
                feature_id = self._id_of_feature(text_id, position, gram)
                dict_features[feature_id] = dict_features.get(feature_id, 0.) + 1.
                # NOTE(review): position advances per back-off variant rather
                # than per n-gram slot — confirm this is the intended scheme.
                position += 1
    return dict_features
66
67
def _filter(self, string):
    """
    Remove stop-word substrings from a token and normalise its case.

    @param string: The token text to clean.
    @return: The token with every stop word removed and, when
        self._lowercase is True, converted to lower case.
    """
    # Bug fix: the old loop returned after the first matching stop word,
    # so only one stop word was ever removed and the lowercase step was
    # skipped entirely for such tokens; it also left a debug print behind.
    for stop_word in self._stop_words:
        if stop_word in string:
            string = string.replace(stop_word, "")
    return string.lower() if self._lowercase else string
75
77 """
78 @gram: A tuple where each element is an element of the gram
79 @type_back_off: A L{FeatureLevelBackOff}
80 """
81 back_off_gram_combinations = [[self._back_off.back_off(token_info, back_off)]
82 if type(self._back_off.back_off(token_info, back_off)) != type([])
83 else self._back_off.back_off(token_info, back_off)
84 for token_info,back_off in zip(gram,type_back_off.split(self._back_off.NGRAM_BACK_OFF_DELIMITER))]
85 list_back_off_combinations = itertools.product(*back_off_gram_combinations)
86 return ['_'.join(back_off_combination)
87 for back_off_combination in list_back_off_combinations]
88