#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2020 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
#         Alex Constantin <alex@keyworder.ch>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import tempfile
import os
import json
import warnings
from subprocess import PIPE

from nltk.internals import (
    find_jar,
    find_file,
    find_dir,
    config_java,
    java,
    _java_options,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"

class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the Stanford Segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter()
    >>> seg.default_config('zh')
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent))
    \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
    <BLANKLINE>
    >>> seg.default_config('ar')
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split()))
    \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
    <BLANKLINE>
    """

    _JAR = "stanford-segmenter.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing="false",
        keep_whitespaces="false",
        encoding="UTF-8",
        options=None,
        verbose=False,
        java_options="-mx2g",
    ):
        # Raise deprecation warning.
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "\nThe StanfordSegmenter will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        warnings.simplefilter("ignore", DeprecationWarning)

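        # Locate the segmenter jar on disk; an explicit path_to_jar takes
        # precedence over the STANFORD_SEGMENTER environment variable.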
        stanford_segmenter = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_SEGMENTER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        if path_to_slf4j is not None:
            slf4j = find_jar(
                "slf4j-api.jar",
                path_to_slf4j,
                env_vars=("SLF4J", "STANFORD_SEGMENTER"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )
        else:
            slf4j = None

        # Joined into java's -cp (classpath) option; segmenter versions older
        # than 2016-10-31 need slf4j on the classpath, newer ones do not.
        self._stanford_jar = os.pathsep.join(
            _ for _ in [stanford_segmenter, slf4j] if _ is not None
        )

        self._java_class = java_class
        self._model = path_to_model
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ",".join(
            "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
        )
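        # For example, options={"tokenizeNLs": True} serializes to
        # 'tokenizeNLs=true', which _execute() later hands to the segmenter
        # via its -options flag (the key name here is only illustrative).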

    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified
        language, using the STANFORD_SEGMENTER and STANFORD_MODELS environment
        variables.
        """

        search_path = ()
        if os.environ.get("STANFORD_SEGMENTER"):
            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

        # Reset Chinese-specific resources; they are only populated for lang='zh'.
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"

        elif lang == "zh":
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            path_to_dict = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    path_to_dict,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError:
                raise LookupError(
                    "Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                    % path_to_dict
                )

            sihan_dir = "./data/"
            try:
                path_to_sihan_dir = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                )
        else:
            raise LookupError("Unsupported language {}".format(lang))

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
            )

    def tokenize(self, s):
        # Deliberately left to the TokenizerI base class; use segment() or
        # segment_sents() to run the segmenter itself.
        super().tokenize(s)

    def segment_file(self, input_file_path):
        """
        Segment the contents of ``input_file_path`` and return the segmented
        output as a string.
        """
        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        return stdout
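
    # Illustrative call pattern for segment_file (the file name here is
    # hypothetical, not from the original source):
    #
    #     seg = StanfordSegmenter()
    #     seg.default_config("zh")
    #     print(seg.segment_file("chinese_corpus.txt"))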

    def segment(self, tokens):
        """
        Segment a single tokenized sentence; equivalent to
        ``segment_sents([tokens])``.
        """
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """
        Segment a list of tokenized sentences and return the segmenter's
        output as a single string.
        """
        encoding = self._encoding
        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        # Write the actual sentences to the temporary input file:
        # one sentence per line, tokens separated by single spaces.
        _input_fh = os.fdopen(_input_fh, "wb")
        _input = "\n".join(" ".join(x) for x in sentences)
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            self._input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        return stdout

    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(["-inputEncoding", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        stdout, _stderr = java(
            cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
        )
        stdout = stdout.decode(encoding)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout


def setup_module(module):
    from nose import SkipTest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        raise SkipTest(
            "Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e)
        )
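

# A minimal, hypothetical usage sketch (not part of the original module): it
# assumes the segmenter jar and models are discoverable through the
# STANFORD_SEGMENTER and STANFORD_MODELS environment variables, as in the
# doctests above.
if __name__ == "__main__":
    seg = StanfordSegmenter()
    seg.default_config("zh")
    print(seg.segment(u"这是斯坦福中文分词器测试"))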