# Source code for sussex_nltk.tag
"""
The tag module provides access to the Stanford and Carnegie Mellon Twitter
part-of-speech taggers.
The Stanford tagger has four different models trained on data that has
been preprocessed differently.
* *wsj-0-18-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a
bidirectional architecture and including word shape and distributional
similarity features.
Penn Treebank tagset.
Performance: 97.28% correct on WSJ 19-21 (90.46% correct on unknown words)
* *wsj-0-18-left3words.tagger* Trained on WSJ sections 0-18 using the left3words
architecture and including word shape features.
Penn Treebank tagset.
Performance: 96.97% correct on WSJ 19-21 (88.85% correct on unknown words)
* *english-left3words-distsim.tagger* Trained on WSJ sections 0-18 and extra
parser training data using the left3words architecture and including word
shape and distributional similarity features.
Penn Treebank tagset.
* *english-bidirectional-distsim.tagger* Trained on WSJ sections 0-18 using a
bidirectional architecture and including word shape and distributional
similarity features.
Penn Treebank tagset.
.. codeauthor::
Matti Lyra
"""
import os
import cmu
from nltk.tag import stanford
import sussex_nltk as susx
# Maps each supported Stanford model name to the path of its ``.tagger``
# file, located under ``stanford/models`` inside ``susx._sussex_root``.
_stanford_models = dict(
    (name, os.path.join(susx._sussex_root, 'stanford/models/%s.tagger' % name))
    for name in (
        'wsj-0-18-bidirectional-distsim',
        'wsj-0-18-left3words',
        'english-left3words-distsim',
        'english-bidirectional-distsim',
    )
)
def stanford_tag(sent, model='wsj-0-18-bidirectional-distsim'):
    """Use the Stanford POS tagger to tag a sentence.

    *sent* is wrapped in a one-element list and handed to the tagger's
    ``tag`` method; whatever that call returns is returned unchanged.

    *model* should be one of the keys of ``_stanford_models``:
    ``'wsj-0-18-bidirectional-distsim'``, ``'wsj-0-18-left3words'``,
    ``'english-left3words-distsim'`` or ``'english-bidirectional-distsim'``.

    Raises ``KeyError`` if *model* is not one of those names.
    """
    try:
        model_path = _stanford_models[model]
    except KeyError:
        # Keep the exception type of a plain dict lookup, but tell the caller
        # which names are actually accepted.
        raise KeyError('unknown Stanford model %r; expected one of: %s'
                       % (model, ', '.join(sorted(_stanford_models))))
    tagger = stanford.POSTagger(
        path_to_model=model_path,
        path_to_jar='/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar')
    return tagger.tag([sent])
def stanford_tag_batch(sents, model=None):
    """Use the Stanford POS tagger to tag a batch of sentences.

    *sents* is passed straight to the tagger's ``batch_tag`` method and the
    result of that call is returned unchanged.

    *model* selects the tagger model. When it is ``None`` (the default) the
    ``english-bidirectional-distsim`` model at its fixed installation path is
    used. Otherwise it should be one of the keys of ``_stanford_models``:
    ``'wsj-0-18-bidirectional-distsim'``, ``'wsj-0-18-left3words'``,
    ``'english-left3words-distsim'`` or ``'english-bidirectional-distsim'``.

    Raises ``KeyError`` if *model* is given but is not one of those names.
    """
    if model is None:
        # Default kept byte-for-byte so existing callers see no change.
        model_path = '/usr/local/scratch/LanguageEngineering/stanford/models/english-bidirectional-distsim.tagger'
    else:
        try:
            model_path = _stanford_models[model]
        except KeyError:
            # Keep the exception type of a plain dict lookup, but tell the
            # caller which names are actually accepted.
            raise KeyError('unknown Stanford model %r; expected one of: %s'
                           % (model, ', '.join(sorted(_stanford_models))))
    tagger = stanford.POSTagger(
        path_to_model=model_path,
        path_to_jar='/usr/local/scratch/LanguageEngineering/stanford/stanford-postagger.jar')
    return tagger.batch_tag(sents)