Package miopia :: Package preprocessor :: Module EmoticonPreProcessor
[hide private]
[frames | no frames]

Source Code for Module miopia.preprocessor.EmoticonPreProcessor

 1  #-*- coding: utf-8 -*-  
 2   
 3  ''' 
 4  @author: David Vilares Calvo 
 5  ''' 
 6   
 7  import codecs 
 8  from miopia.util.ConfigurationManager import ConfigurationManager 
 9  from miopia.preprocessor.PreProcessorDecorator import PreProcessorDecorator 
10   
11   
class EmoticonPreProcessor(PreProcessorDecorator):
    '''
    Preprocessor decorator that rewrites the emoticons found in a text and
    then hands the result to the wrapped preprocessor component.
    '''

    def __init__(self, component, emoticon_dict=None):
        '''
        Constructor
        @param component: The wrapped preprocessor. Its preprocess() function
        is called with the text once the emoticons have been processed
        @param emoticon_dict: A dictionary {emoticon: value}. If None, it is
        loaded from the file configured under "path_emoticon_dict"
        '''
        self._component = component
        if emoticon_dict is None:
            path = ConfigurationManager().getParameter("path_emoticon_dict")
            emoticon_dict = self._get_emoticon_dict(path)
        self._emoticon_dict = emoticon_dict

    def _get_emoticon_dict(self, name_of_dict, coding="utf-8"):
        """
        Load the emoticon dictionary from a whitespace-separated text file.
        @param name_of_dict: The path to the emoticon dictionary file
        @param coding: The encoding of the file
        @return: A dictionary {emoticon: last column of the emoticon's line}
        """
        # The context manager closes the file even if decoding fails.
        with codecs.open(name_of_dict, encoding=coding) as dict_file:
            lines = dict_file.readlines()

        D = {}
        for line in lines:
            # split() already discards the trailing newline.
            columns = line.split()
            # NOTE(review): every column except the last two becomes a key,
            # so the column right before the value is never mapped and a
            # two-column line adds nothing. Kept unchanged because the file
            # layout is not visible here -- confirm against the dictionary.
            for emoticon in columns[:-2]:
                D[emoticon] = columns[-1]
        return D

    def preprocess(self, text):
        """
        Preprocess emoticons and call preprocess component function
        @param text: A String. Byte strings are decoded as UTF-8, ignoring
        the bytes that cannot be decoded
        @return: The value returned by the preprocess function of the
        wrapped component
        """
        # On Python 2 `bytes` is an alias of `str`, so byte strings are
        # decoded as before; on Python 3 text strings pass through untouched.
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        ptext = self._emoticon_transformation(text)
        return self._component.preprocess(ptext)

    def _emoticon_transformation(self, text):
        """
        @param text: A String
        @return: A processed text where emoticons have been processed
        """
        emoticons_found = [e for e in self._emoticon_dict if e in text]

        # Longest emoticons first, so a short emoticon is never rewritten
        # inside a longer one that contains it; ties are broken
        # alphabetically to keep the result deterministic. The list must be
        # iterated directly: wrapping it in set() would discard this order.
        emoticons_found.sort(key=lambda e: (-len(e), e))
        for e in emoticons_found:
            # _build_new_text is not defined in this class; presumably it is
            # provided by PreProcessorDecorator -- TODO confirm.
            text = self._build_new_text(text, e, self._emoticon_dict[e])
        return text
83