Package miopia :: Package analyzer :: Package counter :: Module UnGrammaticalCounter
[hide private]
[frames | no frames]

Source Code for Module miopia.analyzer.counter.UnGrammaticalCounter

 1  ''' 
 2  Created on 19/05/2014 
 3   
 4  @author: david.vilares 
 5  ''' 
 6   
 7  import re 
 8  from miopia.analyzer.counter.RawCounter import RawCounter 
 9   
class UnGrammaticalCounter(RawCounter):
    '''
    This counter counts some ungrammatical phenomena that may be present
    in a text. Currently it counts hashtags and fully upper-cased tokens.

    NOTE(review): REPLICATIONS is registered as a phenomenon but no code in
    this class ever increments it, so its feature value is always 0; the
    compiled C{_repeat_regexp} is likewise unused here. Confirm whether the
    replication count was meant to be implemented.
    '''

    REPLICATIONS = "REPLICATIONS"
    HASHTAGS = "HASHTAGS"
    # The attribute name is misspelt (the value is not); it is kept as-is
    # because external code may reference it.
    WORDS_CAPILATISED = "WORDS_CAPITALISED"
    # Correctly spelt alias for the attribute above.
    WORDS_CAPITALISED = WORDS_CAPILATISED

    def __init__(self, ftc):
        '''
        @param ftc: An instance of L{FeatureTypeConfiguration}
        '''
        super(UnGrammaticalCounter, self).__init__(ftc)
        # Matches a token containing the same word character twice in a row.
        self._repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Matches a single ASCII letter or underscore.
        self._word_regexp = re.compile(r'[a-zA-Z_]')
        self._phenomena = {self.REPLICATIONS, self.HASHTAGS,
                           self.WORDS_CAPILATISED}

    def raw_processing(self, list_text_info):
        '''
        Not supported by this counter.

        @raise NotImplementedError: always
        '''
        raise NotImplementedError

    def _count(self, list_text_info):
        """
        Counts, for every text, the number of whitespace-separated tokens
        that are hashtags and the number that are fully upper-cased.

        @param list_text_info: A list of L{TextInfo} objects
        @return: A dictionary mapping the value of
        C{self._id_of_feature(textid, -1, phenomenon)} to the accumulated
        count of that phenomenon. Counts of texts that yield the same
        feature id are added together.
        """
        dict_features = {}
        for text_info in list_text_info:
            textid = text_info.get_textid()
            text_tokens = text_info.get_text().split()

            # Every phenomenon starts at 0, so each one always produces a
            # feature, even when it does not occur in the text.
            dict_phenomena = {phenomenon: 0 for phenomenon in self._phenomena}
            for token in text_tokens:
                # THERE IS A CAPITALISED WORD: longer than one character,
                # unchanged by upper() and containing at least one ASCII
                # letter or underscore.
                if (len(token) > 1 and token.upper() == token
                        and self._word_regexp.search(token)):
                    dict_phenomena[self.WORDS_CAPILATISED] += 1
                # THERE IS A HASHTAG
                if token.startswith('#'):
                    dict_phenomena[self.HASHTAGS] += 1

            for phenomenon in self._phenomena:
                # _id_of_feature is not defined in this class (presumably
                # inherited from RawCounter); the meaning of the -1 argument
                # is not visible here -- TODO confirm.
                feature_id = self._id_of_feature(textid, -1, phenomenon)
                dict_features[feature_id] = (dict_features.get(feature_id, 0)
                                             + dict_phenomena[phenomenon])
        return dict_features
61