1 '''
2 Created on 19/05/2014
3
4 @author: david.vilares
5 '''
6
7 import re
8 from miopia.analyzer.counter.RawCounter import RawCounter
9
11 '''
This counter counts some ungrammatical phenomena that may be present in a text.
13 '''
14
# Names of the phenomena this counter can report; they are used as keys of
# the per-text count dictionary built in _count.
# NOTE(review): nothing in the visible code ever increments REPLICATIONS, so
# its count stays at 0 when requested -- confirm whether this is intended.
REPLICATIONS= "REPLICATIONS"
HASHTAGS="HASHTAGS"
# NOTE: the attribute name is misspelled ("CAPILATISED") while its value is
# spelled correctly; _count reads it as self.WORDS_CAPILATISED, so renaming
# it requires updating that reference as well.
WORDS_CAPILATISED="WORDS_CAPITALISED"
18
19
29
31 raise NotImplementedError
32
33
def _count(self, list_text_info):
    """
    Count, for every text, the phenomena listed in C{self._phenomena}.

    Each text is split on whitespace and every token is checked for:
      - being written in upper case: longer than one character, left
        unchanged by C{upper()} and matched at least once by
        C{self._word_regexp} (counted under C{self.WORDS_CAPILATISED});
      - starting with C{'#'} (counted under C{self.HASHTAGS}).
    The two checks are independent, so a single token may feed both counts.
    No other phenomenon is incremented here: any other name present in
    C{self._phenomena} is reported with a count of 0.

    @param list_text_info: A list of L{TextInfo} objects
    @return: A dictionary mapping the identifier returned by
        C{self._id_of_feature(textid, -1, phenomenon)} to the number of
        occurrences found. Counts of texts that yield the same identifier
        are summed.
    """
    capitalised_key = self.WORDS_CAPILATISED
    hashtag_key = self.HASHTAGS
    dict_features = {}

    for text_info in list_text_info:
        textid = text_info.get_textid()
        # Every requested phenomenon starts at zero, so it is reported even
        # when the text contains no occurrence of it.
        dict_phenomena = {phenomenon: 0 for phenomenon in self._phenomena}

        for token in text_info.get_text().split():
            # Cheap length test first; the regexp guards against tokens made
            # only of symbols/digits, which upper() also leaves unchanged.
            if (len(token) > 1 and token.upper() == token
                    and self._word_regexp.findall(token)):
                dict_phenomena[capitalised_key] = (
                    dict_phenomena.get(capitalised_key, 0) + 1)

            if token.startswith('#'):
                dict_phenomena[hashtag_key] = (
                    dict_phenomena.get(hashtag_key, 0) + 1)

        # Only the phenomena that were asked for are exported as features.
        for phenomenon in self._phenomena:
            feature_id = self._id_of_feature(textid, -1, phenomenon)
            dict_features[feature_id] = (dict_features.get(feature_id, 0)
                                         + dict_phenomena[phenomenon])

    return dict_features
61