# Source code for sussex_nltk.cmu

"""
The CMU module provides access to the Carnegie Mellon twitter tokenizer. It is
used internally by other modules in the `sussex_nltk` package and should not be
called directly.

.. codeauthor::
    Matti Lyra
"""

import os
from subprocess import PIPE
import tempfile

from nltk.internals import find_jar, config_java, java

import sussex_nltk as susx


def tag(sents, java_options='-Xmx1g'):
    """Tokenizes a sentence using the CMU twitter tokenizer.

    The sentences are written, one per line and UTF-8 encoded, to a
    temporary input file. The Java class
    ``edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger`` is then run with
    ``--input``/``--output`` pointing at temporary files, and the contents
    of the output file are returned. Both temporary files are removed and
    the working directory is restored even if a step fails.

    :param sents: iterable of strings; each one is stripped of surrounding
        whitespace and written on its own line.
    :param java_options: options string handed to
        :func:`nltk.internals.config_java` (defaults to ``'-Xmx1g'``).
    :return: the raw contents of the tagger's output file.
    """
    _root = os.path.join(susx._sussex_root, 'CMU')

    # The classpath is the CMU directory itself plus every jar inside it.
    # os.pathsep keeps the separator correct on platforms where it is not ':'.
    classpath_entries = [_root]
    for jar_name in os.listdir(_root):
        if jar_name.endswith('.jar'):
            classpath_entries.append(
                find_jar(jar_name, path_to_jar=os.path.join(_root, jar_name)))
    classpath = os.pathsep.join(classpath_entries)

    # Build the payload before touching the filesystem so that bad input
    # cannot leave temporary files behind.
    payload = '\n'.join(x.strip() for x in sents)
    if not isinstance(payload, bytes):
        payload = payload.encode('UTF-8')

    input_fd, input_path = tempfile.mkstemp()
    try:
        # The payload is bytes, so the handle must be opened in binary mode.
        with os.fdopen(input_fd, 'wb') as input_fh:
            input_fh.write(payload)

        output_fd, output_path = tempfile.mkstemp(text=True)
        # Only the path is needed: the Java process writes the file itself
        # and the result is re-opened by path afterwards.
        os.close(output_fd)
        try:
            config_java(options=java_options, verbose=False)
            cmd = ['edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger',
                   '--input', input_path,
                   '--output', output_path]

            # Run from inside the CMU directory
            # NOTE(review): presumably the tagger resolves its model files
            # relative to the working directory -- confirm.
            previous_dir = os.getcwd()
            os.chdir(_root)
            try:
                java(cmd, classpath=classpath, stdout=PIPE, stderr=PIPE)
            finally:
                os.chdir(previous_dir)

            with open(output_path, 'r') as output_fh:
                return output_fh.read()
        finally:
            os.unlink(output_path)
    finally:
        os.unlink(input_path)