Source code for sussex_nltk.tag

"""
The tag module provides access to the Stanford and Carnegie Mellon twitter
part-of-speech taggers.

The Stanford tagger has four different models, which differ in their training
data, tagger architecture and feature set:

* *wsj-0-18-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a
  bidirectional architecture and including word shape and distributional
  similarity features.

  Penn Treebank tagset.

  Performance: 97.28% correct on WSJ 19-21 (90.46% correct on unknown words)

* *wsj-0-18-left3words.tagger* Trained on WSJ sections 0-18 using the left3words
  architecture and includes word shape features.
  
  Penn Treebank tagset.
  
  Performance: 96.97% correct on WSJ 19-21 (88.85% correct on unknown words)

* *english-left3words-distsim.tagger* Trained on WSJ sections 0-18 and extra
  parser training data using the left3words architecture and includes word
  shape and distributional similarity features.
  
  Penn Treebank tagset.

* *english-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a 
  bidirectional architecture and including word shape and distributional
  similarity features.
  
  Penn Treebank tagset.
    
.. codeauthor::
    Matti Lyra
"""

import os

import cmu
from nltk.tag import stanford
import sussex_nltk as susx


def twitter_tag_batch(sents):
    """Part-of-speech tags a list of sentences using the CMU twitter tagger.

    *sents* is passed unchanged to ``cmu.tag``. The text it returns is read
    one line at a time, splitting each line into a token and a tag at the
    first tab character.

    Returns one flat list of ``(token, pos_tag)`` tuples covering all of the
    sentences. A line that contains no tab yields an empty ``pos_tag``.
    Blank lines in the tagger output (for example the one left behind by a
    trailing newline) are skipped instead of being reported as ``('', '')``.
    """
    tagger_output = cmu.tag(sents)
    tagged_tokens = []
    for line in tagger_output.split('\n'):
        if not line.strip():
            # Nothing to tag here; without this guard every blank line would
            # come back to the caller as a bogus ('', '') token.
            continue
        token, _, pos_tag = line.partition('\t')
        tagged_tokens.append((token, pos_tag))

    return tagged_tokens
    
def twitter_tag(sent):
    """Part-of-speech tags a single sentence using the CMU twitter tagger.

    Wraps *sent* in a one-element list, hands it to ``twitter_tag_batch``
    and returns that function's result (a list of ``(token, pos_tag)``
    tuples) unchanged.
    """
    return twitter_tag_batch([sent])

# Lookup table from the model names accepted by ``stanford_tag`` to the
# location of the matching ``.tagger`` file underneath ``susx._sussex_root``.
_stanford_models = dict(
    (model_name, os.path.join(susx._sussex_root, relative_path))
    for model_name, relative_path in (
        ('wsj-0-18-bidirectional-distsim',
         'stanford/models/wsj-0-18-bidirectional-distsim.tagger'),
        ('wsj-0-18-left3words',
         'stanford/models/wsj-0-18-left3words.tagger'),
        ('english-left3words-distsim',
         'stanford/models/english-left3words-distsim.tagger'),
        ('english-bidirectional-distsim',
         'stanford/models/english-bidirectional-distsim.tagger'),
    )
)

def stanford_tag(sent, model='wsj-0-18-bidirectional-distsim'):
    """Uses the Stanford POS tagger to tag a sentence.

    *model* should be one of the keys of ``_stanford_models``:
    ``'wsj-0-18-bidirectional-distsim'`` (the default),
    ``'wsj-0-18-left3words'``, ``'english-left3words-distsim'`` or
    ``'english-bidirectional-distsim'``. Any other value raises
    ``KeyError``.

    Returns the result of ``stanford.POSTagger.tag`` called with ``[sent]``.
    """
    model_path = _stanford_models[model]
    # NOTE(review): the jar location is hard-coded while the model paths are
    # derived from susx._sussex_root -- confirm the two always agree.
    tagger = stanford.POSTagger(
                path_to_model=model_path,
                path_to_jar='/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar')

    # The sentence is deliberately wrapped in a one-element list; presumably
    # *sent* is a plain string that the tagger splits on whitespace itself --
    # verify against the nltk version in use.
    return tagger.tag([sent])

def stanford_tag_batch(sents, model=None):
    """Uses the Stanford POS tagger to tag a batch of sentences.

    *sents* is handed unchanged to ``stanford.POSTagger.batch_tag`` and the
    result of that call is returned.

    *model* selects the tagger model. When it is ``None`` (the default) the
    ``english-bidirectional-distsim`` model at its fixed installation path
    is used. Otherwise it should be one of the keys of ``_stanford_models``:
    ``'wsj-0-18-bidirectional-distsim'``, ``'wsj-0-18-left3words'``,
    ``'english-left3words-distsim'`` or ``'english-bidirectional-distsim'``;
    any other value raises ``KeyError``.
    """
    if model is None:
        # Default kept exactly as before so existing callers see no change.
        model_path = '/usr/local/scratch/LanguageEngineering/stanford/models/english-bidirectional-distsim.tagger'
    else:
        # Honour the caller's choice instead of silently ignoring it.
        model_path = _stanford_models[model]

    tagger = stanford.POSTagger(
                path_to_model=model_path,
                path_to_jar='/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar')

    return tagger.batch_tag(sents)
Navigation

Source code for sussex_nltk.tag

Quick search

Navigation