Source code for sussex_nltk.tag

"""
The tag module provides access to the Stanford and Carnegie Mellon twitter
part-of-speech taggers.

The Stanford tagger has four different models trained on data that has
been preprocessed differently. 

* *wsj-0-18-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a
  bidirectional architecture and including word shape and distributional
  similarity features.

  Penn Treebank tagset.

  Performance: 97.28% correct on WSJ 19-21 (90.46% correct on unknown words)

* *wsj-0-18-left3words.tagger* Trained on WSJ sections 0-18 using the left3words
  architecture and includes word shape features.
  
  Penn Treebank tagset.
  
  Performance: 96.97% correct on WSJ 19-21 (88.85% correct on unknown words)

* *english-left3words-distsim.tagger* Trained on WSJ sections 0-18 and extra
  parser training data using the left3words architecture and includes word
  shape and distributional similarity features.
  
  Penn Treebank tagset.

* *english-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a 
  bidirectional architecture and including word shape and distributional
  similarity features.
  
  Penn Treebank tagset.
    
.. codeauthor::
    Matti Lyra
"""

import os

import cmu
from nltk.tag import stanford
import sussex_nltk as susx


def twitter_tag_batch(sents):
    """Tag a list of sentences with the CMU twitter part-of-speech tagger.

    *sents* is handed unchanged to ``cmu.tag``. The returned text is split
    on newlines and each line is split at its first tab character.

    Returns a single flat list of ``(token, pos_tag)`` tuples, one per
    output line (the result is not grouped by sentence).
    """
    # NOTE(review): assumes ``cmu.tag`` returns one string holding a
    # ``token<TAB>tag`` pair per line -- confirm against the cmu module.
    raw_output = cmu.tag(sents)
    # A line without a tab yields an empty tag, and an empty line yields
    # ('', ''); such entries are kept in the result.
    return [
        (word, tag)
        for word, _sep, tag in (
            row.partition('\t') for row in raw_output.split('\n')
        )
    ]
def twitter_tag(sent):
    """Tag a single sentence with the CMU twitter part-of-speech tagger.

    Convenience wrapper: *sent* is wrapped in a one-element list and passed
    to :func:`twitter_tag_batch`, whose result is returned unchanged.
    """
    single_sentence_batch = [sent]
    return twitter_tag_batch(single_sentence_batch)
# Maps each supported Stanford model name to the location of its ``.tagger``
# file underneath ``susx._sussex_root``.
_stanford_models = dict(
    (model_name, os.path.join(susx._sussex_root, relative_path))
    for model_name, relative_path in (
        ('wsj-0-18-bidirectional-distsim',
         'stanford/models/wsj-0-18-bidirectional-distsim.tagger'),
        ('wsj-0-18-left3words',
         'stanford/models/wsj-0-18-left3words.tagger'),
        ('english-left3words-distsim',
         'stanford/models/english-left3words-distsim.tagger'),
        ('english-bidirectional-distsim',
         'stanford/models/english-bidirectional-distsim.tagger'),
    )
)
def stanford_tag(sent, model='wsj-0-18-bidirectional-distsim'):
    """Tag a single sentence with the Stanford POS tagger.

    *model* must be one of the keys of ``_stanford_models``:
    ``'wsj-0-18-bidirectional-distsim'`` (the default),
    ``'wsj-0-18-left3words'``, ``'english-left3words-distsim'`` or
    ``'english-bidirectional-distsim'``. Any other name raises ``KeyError``.

    *sent* is wrapped in a one-element list and passed to the tagger's
    ``tag`` method; that method's result is returned unchanged.
    """
    jar_path = '/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar'
    # The model name is resolved before the tagger is constructed, so an
    # unknown name fails without building a tagger.
    pos_tagger = stanford.POSTagger(
        path_to_model=_stanford_models[model],
        path_to_jar=jar_path,
    )
    return pos_tagger.tag([sent])
def stanford_tag_batch(sents, model=None):
    """Tag a batch of sentences with the Stanford POS tagger.

    *sents* is passed unchanged to the tagger's ``batch_tag`` method and
    that method's result is returned.

    *model* selects the tagger model. When it is ``None`` (the default) the
    fixed ``english-bidirectional-distsim`` model path is used, as before.
    Otherwise it must be a key of ``_stanford_models``
    (``'wsj-0-18-bidirectional-distsim'``, ``'wsj-0-18-left3words'``,
    ``'english-left3words-distsim'`` or ``'english-bidirectional-distsim'``);
    an unknown name raises ``KeyError``, matching :func:`stanford_tag`.
    """
    if model is None:
        # Keep the historical default path so callers that never passed a
        # model get exactly the same tagger as before.
        model_path = '/usr/local/scratch/LanguageEngineering/stanford/models/english-bidirectional-distsim.tagger'
    else:
        # Previously this argument was accepted but silently ignored.
        model_path = _stanford_models[model]
    tagger = stanford.POSTagger(
        path_to_model=model_path,
        path_to_jar='/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar')
    return tagger.batch_tag(sents)