Source code for sussex_nltk.cmu
"""
The CMU module provides access to the Carnegie Mellon twitter tokenizer. It is
used internally by other modules in the `sussex_nltk` package and should not be
called directly.
.. codeauthor::
Matti Lyra
"""
import os
from subprocess import PIPE
import tempfile
from nltk.internals import find_jar, config_java, java
import sussex_nltk as susx
def tag(sents, java_options='-Xmx1g'):
    """Run the CMU twitter tagger over *sents* and return its raw output.

    Each item of *sents* is stripped and written, one per line, to a
    temporary UTF-8 encoded file. The Java class
    ``edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger`` is then run from the
    ``CMU`` directory under ``susx._sussex_root``, with that directory and
    every ``.jar`` file in it on the classpath. Both temporary files are
    removed and the working directory is restored before returning, even
    when the Java call raises.

    :param sents: iterable of sentence strings.
    :param java_options: JVM options handed to ``config_java``.
    :return: the unparsed contents of the file the tagger wrote via its
        ``--output`` argument.
    """
    _root = os.path.join(susx._sussex_root, 'CMU')

    # Build the classpath with the platform separator instead of a
    # hard-coded ':' so the same code also works where it is ';'.
    jar_names = [name for name in os.listdir(_root) if name.endswith('.jar')]
    classpath = os.pathsep.join(
        [_root] + [find_jar(name, path_to_jar=os.path.join(_root, name))
                   for name in jar_names])

    input_path = None
    output_path = None
    try:
        # Write the sentences to the temp file. The handle is binary because
        # the payload is explicitly encoded; a text-mode handle rejects
        # bytes on Python 3.
        input_fd, input_path = tempfile.mkstemp()
        with os.fdopen(input_fd, 'wb') as input_fh:
            input_fh.write(
                '\n'.join(x.strip() for x in sents).encode('UTF-8'))

        # Only the path is needed: the tagger opens the file itself, so the
        # descriptor is released straight away rather than held (and leaked)
        # while the JVM runs.
        output_fd, output_path = tempfile.mkstemp()
        os.close(output_fd)

        config_java(options=java_options, verbose=False)
        cmd = ['edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger',
               '--input', input_path,
               '--output', output_path]

        # NOTE(review): the tagger is launched from inside the CMU
        # directory, presumably because it resolves its resources relative
        # to the working directory -- confirm before changing.
        previous_dir = os.getcwd()
        os.chdir(_root)
        try:
            java(cmd, classpath=classpath, stdout=PIPE, stderr=PIPE)
        finally:
            os.chdir(previous_dir)

        with open(output_path, 'r') as output_fh:
            return output_fh.read()
    finally:
        for path in (input_path, output_path):
            if path is not None:
                os.unlink(path)