1 '''
2 @author: David Vilares Calvo
3 '''
4 from AbstractTagger import AbstractTagger
5 from nltk.tag.sequential import DefaultTagger
6 from nltk.tag.sequential import AffixTagger
7 from nltk.tag import brill
8 from InfoTag import *
9
11 """
12 Wrapper of the NLTK Brill tagger. It builds a Brill-based tagger.
13 """
def __init__(self, training_set, test_set, list_initial_taggers,
             back_off_tagger, fine_tag=True):
    """
    Constructor. Trains a Brill tagger and scores it on test_set.

    An initial tagger is obtained from self._backoff_tagger, refined by
    self._train_brill_tagger on the same training data, and the result is
    evaluated once so that its accuracy is stored on the instance.

    @param training_set: A list of [(word,tag)] used for training
    @param test_set: A list of [(word,tag)] used to evaluate the trained tagger
    @param list_initial_taggers: A list of taggers of type L{nltk.tag.sequential}
    @param back_off_tagger: A back-off tagger for the initial tagger. None for not to include any back-off tagger
    @param fine_tag: A boolean. True for using fine PoS-tags, False for coarse PoS-tags.
                     This constructor accepts the flag but does not read it.
    """
    base_tagger = self._backoff_tagger(
        training_set,
        list_initial_taggers,
        backoff=back_off_tagger,
    )
    # The Brill trainer starts from the initial tagger's output.
    self.__tagger = self._train_brill_tagger(base_tagger, training_set)
    self.__accuracy = self.__tagger.evaluate(test_set)
28
30 """
31 @return: The theoretical accuracy of the trained tagger
32 """
33 return self.__accuracy
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
57 for cls in tagger_classes:
58 backoff=cls(train_sets,backoff=backoff)
59 return backoff
60
61
63 sym_bounds=[(1,1),(2,2),(1,2),(1,3)]
64 asym_bounds=[(-1,1),(1,1)]
65 templates=[
66 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,*sym_bounds),
67 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,*sym_bounds),
68 brill.ProximateTokensTemplate(brill.ProximateTagsRule,*asym_bounds),
69 brill.ProximateTokensTemplate(brill.ProximateWordsRule,*asym_bounds)
70 ]
71 trainer = brill.FastBrillTaggerTrainer(initial_tagger,templates,deterministic=True)
72 return trainer.train(train_sents, **kwargs)
73
74
76 """
77 @param postag: A fine part-of-speech tag according with the regexp cpostag:nameofFeat_valueofFeat-nameofFeat-...
78 @return An L{InfoTag}
79 """
80 feats=""
81
82 if len(postag) ==1:
83 return InfoTag(postag,postag,"_")
84 else:
85 cpostag = postag.split(":")[0]
86 featsList = postag.split(":")[1].split("-")
87 for f in featsList:
88 featkey = f.split("_")[0]
89 featvalue = f.split("_")[1]
90 feats= feats+featkey+"="+featvalue+"|"
91 return InfoTag(cpostag,postag,feats[0:len(feats)-1])
92
93
def tag(self, tokens):
    """
    Public entry point for tagging; delegates to self._tag.

    @param tokens: A list of tokens. Each token is a string.
    @return: A list of tuples (word,L{InfoTag})
    """
    tagged_tokens = self._tag(tokens)
    return tagged_tokens
100
101
def _tag(self, tokens):
    """
    Runs the trained tagger over tokens and converts each predicted tag
    with self._fine_tag_to_cposfeats.

    @param tokens: A list of tokens
    @return: A list of tuples (word,L{InfoTag})
    """
    return [
        (pair[0], self._fine_tag_to_cposfeats(pair[1]))
        for pair in self.__tagger.tag(tokens)
    ]
112