1
2
3 '''
4 @author: David Vilares Calvo
5 '''
6 import re
7 from miopia.preprocessor.PreProcessorDecorator import PreProcessorDecorator
8 from time import time
9
10
12 '''
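13 Preprocessor that rewrites the hashtags of a tweet and then delegates the result to a wrapped preprocessor component.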
14 '''
15
def __init__(self, component, remove_bordering=False):
    '''
    Wrap another preprocessor and prepare the hashtag matcher.

    @param component: An instance defined by the interface L{PreProcessorI}
    @param remove_bordering: Flag kept on the instance as
        C{_remove_bordering}; defaults to False.
    '''
    # A '#' followed by one or more word characters, captured as one group.
    hashtag_regex = re.compile(r'(#\w{1,})')

    self._hashtag_pattern = hashtag_regex
    self._remove_bordering = remove_bordering
    self._component = component
24
25
# NOTE(review): the `def` line for this body is absent from this listing and
# the indentation has been stripped. Presumably this is the
# `preprocess(self, text)` method of the decorator -- confirm against the
# original file.
27 """
28 @param text: A tweet as a string
29 @return A hashtag processed tweet
30 """
# Hashtags are rewritten first by self._process_hashtag; the rewritten text is
# then passed to the wrapped component's preprocess(), whose result is returned.
31 return self._component.preprocess(self._process_hashtag(text))
32
33
# NOTE(review): the `def` line for this body is absent from this listing and
# the indentation has been stripped, so the nesting described in the comments
# below is inferred -- confirm against the original file.
# The only visible call site passes (text, hashtag, occurrence count); the
# statements below read only `text` and `hashtag`.
35 """
36 @return: 0 if is not a bordering hashtag, -1 if it's an initial hashtag and 1 if it's an ending hashtag
37 """
38
# Tokens are whatever str.split() yields when splitting on whitespace.
39 text_splitted = text.split()
# -1 is kept for as long as every token seen so far starts with '#'.
40 code = -1
41 for token in text_splitted:
# The first token that does not start with '#' ends the leading run of
# hashtags; from then on the code can no longer be -1.
42 if not token.startswith('#'):
43 code = 0
44
# Case-insensitive comparison of the whole token against the hashtag
# (presumably at loop level, not nested under the check above).
45 if (token.lower() == hashtag.lower()):
# Matched while still inside the leading run of hashtags: report -1 at once.
46 if code == -1 :
47 return code
48 else:
# Matched after some non-hashtag token: mark it as an ending candidate. Any
# later token that does not start with '#' sets the code back to 0.
49 code = 1
# NOTE(review): presumably at function level, i.e. executed after the loop.
50 return code
51
52
53
# NOTE(review): the `def` line for this body is absent from this listing and
# the indentation has been stripped. The delegating method calls it as
# self._process_hashtag(text). The if/else nesting described in the comments
# below is inferred -- confirm against the original file.
55 """
56 @param text: A tweet. It is a String
57 Eliminate the complete hashtag if there is at the beginning
58 or the end of the text. Otherwise, only delete '#' symbol.
59 Special hashtags as #FF always represented without #, never deleted.
60 """
# Despite the name this is a parenthesised list, not a set.
61 set_special_hashtags = (['#FF','#ff'])
62
63
64
# Every match of the compiled hashtag pattern, in order of appearance
# (duplicates included).
65 hashtags = self._hashtag_pattern.findall(text)
# NOTE(review): initial_text is assigned here and not read again in the
# visible statements.
66 initial_text = text
# NOTE(review): this flag is only ever switched to False below; nothing in the
# visible statements uses it to change the returned text.
67 first_ending_hashtag = True
68
# Running count per hashtag string: how many times it has been visited so far
# by the loop below.
69 ocurrences = {hashtag:0 for hashtag in hashtags}
70 for hashtag in hashtags:
71 ocurrences[hashtag]+=1
72
# `text` is the progressively rewritten string, not the original input.
# Per the is_bordering_hashtag docstring the result is -1, 0 or 1; both -1 and
# 1 are truthy.
73 is_bordering = self.is_bordering_hashtag(text, hashtag,ocurrences[hashtag])
74 if is_bordering:
75 if not self._remove_bordering:
# Bordering hashtag that is kept: swap it for its text without the leading '#'.
# _build_new_text is not defined in this listing; presumably maxreplace=1
# limits the substitution to one occurrence -- confirm.
76 text= self._build_new_text(text, hashtag, hashtag[1:], maxreplace=1)
77 if first_ending_hashtag and is_bordering ==1:
78 first_ending_hashtag = False
# NOTE(review): presumably paired with `if not self._remove_bordering`, i.e.
# the branch that deletes a bordering hashtag. str.replace removes every
# occurrence of the substring, including where it is the prefix of a longer
# hashtag.
79 else:
80 text = text.replace(hashtag,'')
# NOTE(review): presumably paired with `if is_bordering`, i.e. the branch for
# hashtags that are not bordering.
81 else:
82 if hashtag in set_special_hashtags:
# Special hashtags lose the '#' and are upper-cased.
83 text = text.replace(hashtag,hashtag[1:].upper())
84 else:
# Any other hashtag just loses its leading '#', at every occurrence of the
# substring.
85 text = text.replace(hashtag, hashtag[1:])
86 return text
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122