'''
@author: David Vilares Calvo
'''

import re
import codecs
from miopia.preparator.LexicalSentimentInfo import LexicalSentimentInfo, LexicalValenceShifter
from miopia.util.ConfigurationManager import ConfigurationManager
import xml.etree.ElementTree as ET


# NOTE: class name assumed (the original definition line is missing);
# chosen to match the miopia.preparator package.
class PreProcessor(object):
    '''
    Splits a text into sentences, tokenizes and PoS-tags them, and collects
    lexical sentiment information (capitalization and character replication).
    '''
    def __init__(self, sentence_tokenizer, tokenizer, tagger,
                 vocabulary_set=None):
        '''
        Constructor
        @param sentence_tokenizer: Use tokenizers/punkt/spanish.pickle from nltk_data
        @param tokenizer: An instance of L{nltk.tokenize.PunktWordTokenizer}
        @param tagger: Use spanish_brill.pickle (after unserializing) included in this package
        @param vocabulary_set: A Python set with the vocabulary
        '''
        self._sentence_tokenizer = sentence_tokenizer
        self._tokenizer = tokenizer
        self._tagger = tagger
        self._vocabulary_set = vocabulary_set if vocabulary_set is not None else set()
        '''
        if vocabulary_set is None:
            self._vocabulary_set = self._build_vocabulary_set(ConfigurationManager().getParameter("path_vocabulary_set"))
        else:
            self._vocabulary_set = vocabulary_set
        '''

    def get_vocabulary_set(self):  # name assumed; original def line lost
        return self._vocabulary_set

    def _build_vocabulary_set(self, path_to_file):
        """
        @param path_to_file: A path to the file with the vocabulary of words.

        Example of the structure of a vocabulary file:
        Word1
        Word2
        ...
        WordN
        """
        return set(codecs.open(path_to_file, encoding='utf-8').read().split())
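        # Illustrative call (the path is hypothetical):
        #   self._build_vocabulary_set('/path/to/vocabulary.txt')
        #   -> set([u'Word1', u'Word2', ..., u'WordN'])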

    def sentence_tokenize(self, text):  # name assumed; original def line lost
        """
        @param text: A String
        @return: A list of strings with the sentences of the text
        """
        return self._sentence_tokenizer.tokenize(text)

    def _replications(self, token):
        """
        @param token: A String
        @return: A list with the runs of repeated chars (length >= 2) in the token
        """
        try:
            # Numeric tokens (e.g. "2000") are not treated as replications
            float(token)
            return []
        except ValueError:
            matcher = re.compile(r'(.)\1*')
            replications = [match.group() for match in matcher.finditer(token)
                            if len(match.group()) >= 2]
            return replications
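        # Examples (computed with the regex above):
        #   self._replications(u'Buenooo!!!') -> [u'ooo', u'!!!']
        #   self._replications(u'2000')       -> []   (parses as a number)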

    def _eliminate_replications(self, token, replications):
        """
        @param token: A String
        @param replications: A list with the replicated chars of a token
        @return: A valid word in the vocabulary if it exists, otherwise
        the original word.
        """
        combinations = []

        if token not in self._vocabulary_set and token.lower() not in self._vocabulary_set:
            for r in replications:
                # Try shrinking the run to length 2, then to length 1
                for i in reversed(range(1, 3)):
                    c = token.replace(r, r[:i])
                    if (c in self._vocabulary_set
                            or c.lower() in self._vocabulary_set):
                        combinations.append(c)
                    # Also try shrinking a second run on top of this one
                    for r2 in replications:
                        if r2 is not r:
                            for j in reversed(range(1, 3)):
                                if ((c.replace(r2, r2[:j]) in self._vocabulary_set
                                     or c.replace(r2, r2[:j]).lower() in self._vocabulary_set)
                                        and c.replace(r2, r2[:j]) not in combinations):
                                    combinations.append(c.replace(r2, r2[:j]))
        if combinations == []:
            return token
        else:
            return sorted(combinations, key=len)[0]
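        # Example, assuming the vocabulary contains u'bueno':
        #   self._eliminate_replications(u'Buenooo', [u'ooo'])
        #   tries u'Buenoo' (i=2) and u'Bueno' (i=1), keeps the candidates
        #   found in the vocabulary, and returns the shortest: u'Bueno'.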

    def _is_upper_intesification(self, token):
        """
        @param token: A String
        @return: True if the token is a fully capitalized word that is not
        in the vocabulary, False otherwise
        """
        return (token.isupper() and token not in self._vocabulary_set)
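        # Example: u'GENIAL' -> True when u'GENIAL' is not itself a vocabulary
        # entry; u'Genial' -> False ('Genial'.isupper() is False).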

    def _is_intensifier_replication(self, replications):
        """
        @param replications: A list with the replicated chars of a token
        @return: True if there is a run of three or more replicated chars
        (other than an ellipsis '...'), False otherwise
        """
        longest_replication = sorted(replications, key=len, reverse=True)[0]
        return (len(longest_replication) >= 3 and
                longest_replication != '...')
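        # Examples: [u'ooo'] -> True; [u'...'] -> False; [u'oo'] -> False.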


    def tokenize(self, sentences):  # name assumed; original def line lost
        """
        @param sentences: A list of the sentences of a given text
        @return: A tuple with a list of lists of tokens and a
        L{LexicalSentimentInfo} instance with the lexical sentiment
        information for the text. The L{LexicalSentimentInfo} is None
        if no lexical sentiment info is found.
        """
        lsi = None
        sentences_tokenized = []
        processed_tokens = []
        id_sentence = 1

        for s in sentences:
            tokens = self._tokenizer.tokenize(s)
            id_token = 1
            for t in tokens:
                # Fully capitalized unknown words are read as emphasis (CAPS)
                if self._is_upper_intesification(t):
                    if lsi is None: lsi = LexicalSentimentInfo()
                    t = t.lower()
                    lsi.add_lexical_valence_shifter(id_sentence,
                                                    id_token,
                                                    LexicalValenceShifter.CAPS)
                # Character replications are normalized against the vocabulary
                replications = self._replications(t)
                if replications != []:
                    normalized_t = self._eliminate_replications(t, replications)
                    t = normalized_t
                    if self._is_intensifier_replication(replications):
                        if lsi is None: lsi = LexicalSentimentInfo()
                        lsi.add_lexical_valence_shifter(id_sentence,
                                                        id_token,
                                                        LexicalValenceShifter.REPLICATION)
                id_token += 1
                processed_tokens.append(t)
            sentences_tokenized.append(processed_tokens)
            processed_tokens = []
            id_sentence += 1
        return (sentences_tokenized, lsi)
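        # Sketch of the return value for [u'Me gusta muchooo'], assuming
        # u'mucho' is in the vocabulary:
        #   ([[u'Me', u'gusta', u'mucho']],
        #    <LexicalSentimentInfo with REPLICATION at sentence 1, token 3>)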

    def tag(self, tokenized_sentences):  # name assumed; original def line lost
        """
        @param tokenized_sentences: A list of lists of tokens
        @return: A list of tagged sentences. Each tagged sentence is a list
        of tuples (token, L{InfoTag})
        """
        return [self._tagger.tag(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

    def write_lexical_sentiment_info(self, path_dest, dict_of_lsi):  # name assumed
        """
        Writes in path_dest an XML representation of the
        L{LexicalSentimentInfo} of each file
        @param path_dest: A path to the destination XML file
        @param dict_of_lsi: A dictionary of L{LexicalSentimentInfo}
        """
        def create_element(parent, element_name, text):
            element = ET.SubElement(parent, element_name)
            element.text = text
            return element

        files = ET.Element('files')
        for lsi_file in dict_of_lsi.keys():
            if dict_of_lsi[lsi_file] is not None:
                file_element = ET.SubElement(files, 'file')
                create_element(file_element, 'fileid', lsi_file)
                sentences = ET.SubElement(file_element, 'sentences')
                for sentence_key in dict_of_lsi[lsi_file].get_dict():
                    sentence = ET.SubElement(sentences, 'sentence')
                    create_element(sentence, 'sentenceid', str(sentence_key))
                    words = ET.SubElement(sentence, 'words')
                    word_dict = dict_of_lsi[lsi_file].get_dict()[sentence_key]
                    for word_id, phenomena in word_dict.items():
                        word = ET.SubElement(words, 'word')
                        create_element(word, 'wordid', str(word_id))
                        create_element(word, 'phenomena', ','.join(phenomena))
        # ElementTree opens and writes the file itself; no separate handle needed
        ET.ElementTree(files).write(path_dest, encoding='UTF-8')
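        # The resulting XML has this shape (illustrative values):
        #   <files>
        #     <file>
        #       <fileid>review1.txt</fileid>
        #       <sentences>
        #         <sentence>
        #           <sentenceid>1</sentenceid>
        #           <words>
        #             <word><wordid>3</wordid><phenomena>CAPS,REPLICATION</phenomena></word>
        #           </words>
        #         </sentence>
        #       </sentences>
        #     </file>
        #   </files>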

    def read_lexical_sentiment_info(self, input_path):  # name assumed
        """
        @param input_path: A path to an XML file with the lexical sentiment
        info needed to build a dict of L{LexicalSentimentInfo}
        @return: A dictionary of L{LexicalSentimentInfo}
        """
        tree = ET.parse(input_path)
        root = tree.getroot()
        dict_of_lsi = {}
        files = root.findall('file')

        switch = {'CAPS': LexicalValenceShifter.CAPS,
                  'REPLICATION': LexicalValenceShifter.REPLICATION}

        for f in files:
            sentences = f.find('sentences').findall('sentence')
            lsi = LexicalSentimentInfo()
            for s in sentences:
                words = s.find('words').findall('word')
                for w in words:
                    phenomena = w.find('phenomena').text.split(',')
                    for phenomenon in phenomena:
                        lsi.add_lexical_valence_shifter(int(s.find('sentenceid').text),
                                                        int(w.find('wordid').text),
                                                        switch[phenomenon])
            dict_of_lsi[f.find('fileid').text] = lsi
        return dict_of_lsi
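
# A minimal usage sketch (not part of the original module). The resource
# paths, the PunktWordTokenizer import (available in older NLTK releases),
# and the unpickling of spanish_brill.pickle are assumptions taken from the
# constructor's docstring; adjust them to your installation.
if __name__ == '__main__':
    import pickle
    import nltk.data
    from nltk.tokenize import PunktWordTokenizer

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    tokenizer = PunktWordTokenizer()
    with open('spanish_brill.pickle', 'rb') as tagger_file:  # path assumed
        tagger = pickle.load(tagger_file)

    preprocessor = PreProcessor(sentence_tokenizer, tokenizer, tagger,
                                vocabulary_set=set([u'bueno', u'mucho']))
    sentences = preprocessor.sentence_tokenize(u'Muy buenooo. Me ha gustado MUCHO.')
    tokenized, lsi = preprocessor.tokenize(sentences)
    tagged = preprocessor.tag(tokenized)
    preprocessor.write_lexical_sentiment_info('lsi.xml', {'example.txt': lsi})
    print(preprocessor.read_lexical_sentiment_info('lsi.xml'))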