#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2019 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
# Alex Constantin <alex@keyworder.ch>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals, print_function
import tempfile
import os
import json
import warnings
from subprocess import PIPE
from six import text_type
from nltk.internals import (
find_jar,
find_file,
find_dir,
config_java,
java,
_java_options,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = 'https://nlp.stanford.edu/software'


class StanfordSegmenter(TokenizerI):
"""Interface to the Stanford Segmenter
If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
should be provieded, for example::
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> seg = StanfordSegmenter()
>>> seg.default_config('zh')
>>> sent = u'这是斯坦福中文分词器测试'
>>> print(seg.segment(sent))
\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
<BLANKLINE>
>>> seg.default_config('ar')
>>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
>>> print(seg.segment(sent.split()))
\u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
<BLANKLINE>
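
    If the jars and models cannot be found through the environment
    variables, their locations can be given explicitly; the paths below
    are placeholders, not real installation paths::

        seg = StanfordSegmenter(
            path_to_jar='/YOUR_PATH/stanford-segmenter.jar',
            path_to_model='/YOUR_PATH/data/pku.gz',
            path_to_dict='/YOUR_PATH/data/dict-chris6.ser.gz',
            path_to_sihan_corpora_dict='/YOUR_PATH/data',
        )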
"""
    _JAR = 'stanford-segmenter.jar'

    def __init__(
self,
path_to_jar=None,
path_to_slf4j=None,
java_class=None,
path_to_model=None,
path_to_dict=None,
path_to_sihan_corpora_dict=None,
sihan_post_processing='false',
keep_whitespaces='false',
encoding='UTF-8',
options=None,
verbose=False,
java_options='-mx2g',
):
        # Raise deprecation warning.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(
            "\nThe StanfordSegmenter will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        warnings.simplefilter('ignore', DeprecationWarning)
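        # Note: 'always' above forces the warning to display once; the filter
        # is then reset to 'ignore' to suppress further DeprecationWarnings.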
stanford_segmenter = find_jar(
self._JAR,
path_to_jar,
env_vars=('STANFORD_SEGMENTER',),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
if path_to_slf4j is not None:
slf4j = find_jar(
'slf4j-api.jar',
path_to_slf4j,
env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
else:
slf4j = None
        # This is passed to java as the -cp option; the old version of the
        # segmenter needs slf4j on the classpath, while
        # stanford-segmenter-2016-10-31 and newer do not.
self._stanford_jar = os.pathsep.join(
_ for _ in [stanford_segmenter, slf4j] if _ is not None
)
self._java_class = java_class
self._model = path_to_model
self._sihan_corpora_dict = path_to_sihan_corpora_dict
self._sihan_post_processing = sihan_post_processing
self._keep_whitespaces = keep_whitespaces
self._dict = path_to_dict
self._encoding = encoding
self.java_options = java_options
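        # Render any extra Stanford CLI options as 'key1=val1,key2=val2'
        # for the segmenter's -options flag; values are JSON-encoded.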
options = {} if options is None else options
self._options_cmd = ','.join(
'{0}={1}'.format(key, json.dumps(val)) for key, val in options.items()
)

    def default_config(self, lang):
"""
        Attempt to initialize the Stanford Word Segmenter for the specified
        language, using the STANFORD_SEGMENTER and STANFORD_MODELS
        environment variables.
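
        A minimal sketch of configuring the search paths programmatically
        before calling this method (the path below is a placeholder)::

            import os
            os.environ['STANFORD_SEGMENTER'] = '/YOUR_PATH/stanford-segmenter'
            os.environ['STANFORD_MODELS'] = '/YOUR_PATH/stanford-segmenter/data'
            seg.default_config('zh')  # or 'ar'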
"""
search_path = ()
if os.environ.get('STANFORD_SEGMENTER'):
search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
# init for Chinese-specific files
self._dict = None
self._sihan_corpora_dict = None
self._sihan_post_processing = 'false'
if lang == 'ar':
self._java_class = (
'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
)
model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
elif lang == 'zh':
self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
model = 'pku.gz'
self._sihan_post_processing = 'true'
path_to_dict = 'dict-chris6.ser.gz'
try:
self._dict = find_file(
path_to_dict,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=('STANFORD_MODELS',),
)
except LookupError:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
% path_to_dict
)
sihan_dir = './data/'
try:
path_to_sihan_dir = find_dir(
sihan_dir,
url=_stanford_url,
verbose=False,
env_vars=('STANFORD_SEGMENTER',),
)
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
except LookupError:
raise LookupError(
"Could not find '%s' (tried using the "
"STANFORD_SEGMENTER environment variable)" % sihan_dir
)
else:
raise LookupError("Unsupported language {}".format(lang))
try:
self._model = find_file(
model,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER'),
)
except LookupError:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
)

    def tokenize(self, s):
        """Tokenize *s* via the generic ``TokenizerI`` interface."""
        return super(StanfordSegmenter, self).tokenize(s)

    def segment_file(self, input_file_path):
"""
"""
cmd = [
self._java_class,
'-loadClassifier',
self._model,
'-keepAllWhitespaces',
self._keep_whitespaces,
'-textFile',
input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
'-serDictionary',
self._dict,
'-sighanCorporaDict',
self._sihan_corpora_dict,
'-sighanPostProcessing',
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
return stdout

    def segment(self, tokens):
        """Segment a single sentence; a convenience wrapper around
        :meth:`segment_sents`."""
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """
        Segment a list of pre-tokenized sentences, where each sentence is a
        list of tokens; the segmented output is returned as a single string.
encoding = self._encoding
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
        # Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
self._java_class,
'-loadClassifier',
self._model,
'-keepAllWhitespaces',
self._keep_whitespaces,
'-textFile',
self._input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
'-serDictionary',
self._dict,
'-sighanCorporaDict',
self._sihan_corpora_dict,
'-sighanPostProcessing',
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
# Delete the temporary file
os.unlink(self._input_file_path)
return stdout

    def _execute(self, cmd, verbose=False):
encoding = self._encoding
cmd.extend(['-inputEncoding', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(['-options', self._options_cmd])
default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
stdout, _stderr = java(
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout


def setup_module(module):
from nose import SkipTest
try:
seg = StanfordSegmenter()
seg.default_config('ar')
seg.default_config('zh')
except LookupError as e:
raise SkipTest(
'Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e)
)
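

# A minimal manual smoke test, assuming the jars and models are discoverable
# through the environment variables described in the class docstring; this
# mirrors the doctest above and is not part of the automated test suite.
if __name__ == '__main__':
    seg = StanfordSegmenter()
    seg.default_config('zh')
    print(seg.segment(u'这是斯坦福中文分词器测试'))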