Package miopia :: Package tagger :: Module BrillTagger
[hide private]
[frames | no frames]

Source Code for Module miopia.tagger.BrillTagger

  1  ''' 
  2  @author: David Vilares Calvo 
  3  ''' 
  4  from AbstractTagger import AbstractTagger 
  5  from nltk.tag.sequential import DefaultTagger 
  6  from nltk.tag.sequential import AffixTagger 
  7  from nltk.tag import brill 
  8  from InfoTag import * 
  9   
class BrillTagger(AbstractTagger):
    """
    Wrapper of the NLTK Brill tagger. It builds a Brill-based tagger.
    """

    def __init__(self, training_set, test_set, list_initial_taggers,
                 back_off_tagger, fine_tag=True):
        """
        Constructor. Builds the initial tagger chain, trains the Brill tagger
        on top of it and stores the accuracy obtained on C{test_set}.

        @param training_set: A list of [(word,tag)]
        @param test_set: A list of [(word,tag)]
        @param list_initial_taggers: A list of taggers of type L{nltk.tag.sequential}
        @param back_off_tagger: A back off tagger of initial tagger. None for
            not to include any back-off tagger
        @param fine_tag: A boolean. True for using Fine PoS-tags, False for
            Coarse PoS-tags. Not read by the constructor at the moment.
        """
        initial_tagger = self._backoff_tagger(training_set,
                                              list_initial_taggers,
                                              backoff=back_off_tagger)
        self.__tagger = self._train_brill_tagger(initial_tagger, training_set)
        self.__accuracy = self.__tagger.evaluate(test_set)

    def get_accuracy(self):
        """
        @return: The theoretical accuracy of the trained tagger
        """
        return self.__accuracy

    # Legacy affix-based back-off chain, kept for reference (disabled).
    #
    # def _get_backoff_tagger(self, fine_tag, t):
    #     '''
    #     @param fine_tag: A boolean. True for fine tags, False for coarse tags.
    #     @param t: The training set, it's a list of (word,tags)
    #     @return: A back-off affix tagger
    #     '''
    #     default_tagger = DefaultTagger(self._default_tag)
    #     pre1_tagger = AffixTagger(t,affix_length=1,backoff=default_tagger)
    #     pre2_tagger = AffixTagger(t,affix_length=2,backoff=pre1_tagger)
    #     pre3_tagger = AffixTagger(t,affix_length=3,backoff=pre2_tagger)
    #     pre4_tagger = AffixTagger(t,affix_length=4,backoff=pre3_tagger)
    #     pre5_tagger = AffixTagger(t,affix_length=5,backoff=pre4_tagger)
    #     suf3_tagger = AffixTagger(t,affix_length=-3,backoff=pre5_tagger)
    #     suf4_tagger = AffixTagger(t,affix_length=-4,backoff=suf3_tagger)
    #     suf5_tagger = AffixTagger(t,affix_length=-5,backoff=suf4_tagger)
    #
    #     backoffTagger = suf5_tagger
    #     return backoffTagger

    def _backoff_tagger(self, train_sets, tagger_classes, backoff=None):
        """
        Chains the given tagger classes into a single back-off sequence.

        @param train_sets: The training data handed to every tagger class
        @param tagger_classes: An iterable of callables accepting
            C{(train_sets, backoff=...)}
        @param backoff: The tagger used as back-off of the first class
        @return: The last tagger built, or C{backoff} itself when
            C{tagger_classes} is empty
        """
        # Each new tagger falls back on the one built just before it.
        for cls in tagger_classes:
            backoff = cls(train_sets, backoff=backoff)
        return backoff

    def _train_brill_tagger(self, initial_tagger, train_sents, **kwargs):
        """
        Trains a Brill tagger on top of C{initial_tagger}.

        @param initial_tagger: The tagger passed to the Brill trainer
        @param train_sents: The training data passed to the trainer
        @param kwargs: Extra keyword arguments forwarded to C{trainer.train}
        @return: The result of C{trainer.train}
        """
        sym_bounds = [(1, 1), (2, 2), (1, 2), (1, 3)]
        asym_bounds = [(-1, 1), (1, 1)]
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, *sym_bounds),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, *sym_bounds),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, *asym_bounds),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, *asym_bounds),
        ]
        trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates,
                                               deterministic=True)
        return trainer.train(train_sents, **kwargs)

    def _fine_tag_to_cposfeats(self, postag):
        """
        Splits a fine part-of-speech tag into its coarse tag and features.

        @param postag: A fine part-of-speech tag according with the regexp
            cpostag:nameofFeat_valueofFeat-nameofFeat-...
        @return: An L{InfoTag}. When the tag carries no features, the tag is
            used as coarse tag too and the feats field is "_".
        @raise IndexError: If a non-empty feature lacks the "_" separator
        """
        # There are no feats. Checking for the ":" separator (and not only
        # for one-character tags) keeps longer feature-less tags from
        # failing on the split below.
        if len(postag) == 1 or ":" not in postag:
            return InfoTag(postag, postag, "_")

        tag_parts = postag.split(":")
        cpostag = tag_parts[0]
        pairs = []
        for feat in tag_parts[1].split("-"):
            if not feat:
                # Empty chunk (e.g. "n:" or a doubled "-"): nothing to record.
                continue
            key_value = feat.split("_")
            pairs.append(key_value[0] + "=" + key_value[1])
        feats = "|".join(pairs) if pairs else "_"
        return InfoTag(cpostag, postag, feats)

    def tag(self, tokens):
        """
        @param tokens: A list of tokens. Each token is a string.
        @return: A list of tuples (word,L{InfoTag})
        """
        return self._tag(tokens)

    def _tag(self, tokens):
        """
        @param tokens: A list of tokens
        @return: A list of tuples (word,L{InfoTag})
        """
        tokens_tagged = self.__tagger.tag(tokens)
        return [(tg[0], self._fine_tag_to_cposfeats(tg[1]))
                for tg in tokens_tagged]
112