Package miopia :: Package preprocessor :: Module HashTagProcessor
[hide private]
[frames | no frames]

Source Code for Module miopia.preprocessor.HashTagProcessor

  1  #-*- coding: utf-8 -*- 
  2   
  3  ''' 
  4  @author: David Vilares Calvo 
  5  ''' 
  6  import re 
  7  from miopia.preprocessor.PreProcessorDecorator import PreProcessorDecorator 
  8  from time import time 
  9   
 10   
class HashTagProcessor(PreProcessorDecorator):
    """
    Preprocessor decorator that rewrites the hashtags of a tweet and then
    hands the rewritten text to the wrapped preprocessor.

    Summary of the rewriting rules (implemented in L{_process_hashtag}):
      - special hashtags (C{#FF} / C{#ff}) lose the '#' and are upper-cased,
        wherever they appear; they are never deleted;
      - a hashtag in the middle of the text only loses its '#';
      - a bordering hashtag (part of the leading or trailing run of
        hashtags) is deleted when C{remove_bordering} is true, otherwise it
        is handed to C{_build_new_text}.
    """

    # Hashtags that must always survive as plain upper-case words.
    # TODO: #ff (and #FF) is capitalised but must not count as an intensifier
    _SPECIAL_HASHTAGS = frozenset(['#FF', '#ff'])

    def __init__(self, component, remove_bordering=False):
        """
        Constructor
        @param component: An instance defined by the interface L{PreProcessorI}
        @param remove_bordering: If true, bordering hashtags are deleted
            from the text; if false they are passed to C{_build_new_text}.
        """
        self._component = component
        self._remove_bordering = remove_bordering
        self._hashtag_pattern = re.compile(r'(#\w{1,})')

    def preprocess(self, text):
        """
        @param text: A tweet as a string
        @return: The result of the wrapped component's C{preprocess} applied
            to the hashtag-processed tweet
        """
        return self._component.preprocess(self._process_hashtag(text))

    def is_bordering_hashtag(self, text, hashtag, nocurrence):
        """
        Classify C{hashtag} by its position among the whitespace-separated
        tokens of C{text}. Tokens are compared case-insensitively and as a
        whole, so a hashtag glued to punctuation (C{'#tag.'}) is not
        recognised as a match.

        @param text: The text to inspect
        @param hashtag: The hashtag to look for, including the leading '#'
        @param nocurrence: Occurrence counter supplied by the caller;
            currently unused (kept for interface compatibility)
        @return: -1 if the hashtag is matched before any token that does not
            start with '#' (initial hashtag), or if the text contains no such
            token at all; 1 if, after its last match, only tokens starting
            with '#' follow (ending hashtag); 0 otherwise
        """
        # TODO: Repeated hashtags
        target = hashtag.lower()
        code = -1
        for token in text.split():
            if not token.startswith('#'):
                # A regular word ends the leading run and cancels any
                # "ending" verdict reached so far.
                code = 0
            if token.lower() == target:
                if code == -1:
                    return code
                code = 1
        return code

    @staticmethod
    def _replace_hashtag(text, hashtag, replacement):
        """
        Replace every occurrence of exactly C{hashtag} in C{text}.

        The negative lookahead keeps a longer hashtag that merely starts
        with C{hashtag} (C{#love} vs. C{#lovely}) untouched. No regex flags
        are used, the same as for the pattern that extracts the hashtags.

        @param text: The text to rewrite
        @param hashtag: The hashtag to replace, including the leading '#'
        @param replacement: The literal replacement string
        @return: The rewritten text
        """
        pattern = re.escape(hashtag) + r'(?!\w)'
        # A callable replacement is inserted verbatim (no escape processing).
        return re.sub(pattern, lambda _match: replacement, text)

    def _process_hashtag(self, text):
        """
        Eliminate the complete hashtag if it is at the beginning or the end
        of the text and bordering removal is enabled. Otherwise, only delete
        the '#' symbol. Special hashtags such as #FF are always represented
        without '#' (upper-cased) and never deleted.

        Known limitation: when the same hashtag occurs both in a bordering
        and in a non-bordering position, all of its occurrences are treated
        according to the first classification.

        @param text: A tweet. It is a String
        @return: The tweet with its hashtags rewritten
        """
        occurrences = {}
        for hashtag in self._hashtag_pattern.findall(text):
            occurrences[hashtag] = occurrences.get(hashtag, 0) + 1
            bare = hashtag[1:]

            # Special hashtags are handled first so that a bordering
            # position can never cause them to be deleted.
            if hashtag in self._SPECIAL_HASHTAGS:
                text = self._replace_hashtag(text, hashtag, bare.upper())
                continue

            is_bordering = self.is_bordering_hashtag(text, hashtag,
                                                     occurrences[hashtag])
            if not is_bordering:
                text = self._replace_hashtag(text, hashtag, bare)
            elif self._remove_bordering:
                text = self._replace_hashtag(text, hashtag, '')
            else:
                # NOTE(review): _build_new_text is not defined in this
                # class; presumably inherited from PreProcessorDecorator
                # -- confirm.
                text = self._build_new_text(text, hashtag, bare,
                                            maxreplace=1)
        return text
87 88 89 90 # def _process_hashtag(self,text): 91 # """ 92 # @param text: A tweet. It is a String 93 # Eliminate the complete hashtag if there is at the beginning 94 # or the end of the text. Otherwise, only delete '#' symbol. 95 # Special hashtags as #FF always represented without #, never deleted. 96 # """ 97 # set_special_hashtags = (['#FF','#ff']) 98 # #TODO: #ff (and #FF) capitalised but not must count as an intensifier 99 ## start = time() 100 # #hashtags = re.findall(r'(#\w{1,})',text) 101 # hashtags = self._hashtag_pattern.findall(text) 102 # initial_text = text 103 # for hashtag in hashtags: 104 # print "proof: ",self.is_bordering_hashtag(text, hashtag),"|" , hashtag, "|" ,text 105 # if ((text.startswith(hashtag) or text.endswith(hashtag)) 106 # and hashtag.upper() not in set_special_hashtags): 107 ## if not self._remove_bordering: 108 ## text = text.replace(hashtag,self._build_new_sentence(hashtag[1:])) 109 ## else: 110 ## print "entra aqui" 111 ## text = text.replace(hashtag,'') 112 # pass 113 # else: 114 # if hashtag in set_special_hashtags: 115 # text = text.replace(hashtag,hashtag[1:].upper()) 116 # else: 117 # text = text.replace(hashtag, hashtag[1:]) 118 ## print initial_text 119 ## print text 120 # return text 121 # 122