You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1144 lines
38 KiB
Python
1144 lines
38 KiB
Python
# Natural Language Toolkit: Internal utility functions
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
|
# Edward Loper <edloper@gmail.com>
|
|
# Nitin Madnani <nmadnani@ets.org>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
from __future__ import print_function
|
|
|
|
import subprocess
|
|
import os
|
|
import fnmatch
|
|
import re
|
|
import warnings
|
|
import textwrap
|
|
import types
|
|
import sys
|
|
import stat
|
|
import locale
|
|
|
|
# Use the c version of ElementTree, which is faster, if possible:
|
|
try:
|
|
from xml.etree import cElementTree as ElementTree
|
|
except ImportError:
|
|
from xml.etree import ElementTree
|
|
|
|
from six import string_types
|
|
|
|
from nltk import compat
|
|
|
|
##########################################################################
|
|
# Java Via Command-Line
|
|
##########################################################################
|
|
|
|
# Module-level Java configuration, filled in by config_java().
_java_bin = None
_java_options = []


# [xx] add classpath option to config_java?
def config_java(bin=None, options=None, verbose=False):
    """
    Record where the Java binary lives and which extra options should be
    passed to it whenever nltk launches Java.

    :param bin: The full path to the Java binary.  If not specified,
        the binary is searched for via ``find_binary`` (using the
        ``JAVAHOME`` and ``JAVA_HOME`` environment variables); that
        search raises ``LookupError`` when nothing is found.
    :type bin: str
    :param options: Options to pass to the Java binary, e.g.
        ``'-Xmx512m'``.  A single string is split on whitespace.  If
        ``None``, the currently configured options are left untouched.
    :type options: list(str) or str
    :param verbose: Forwarded to ``find_binary``.
    :type verbose: bool
    """
    global _java_bin, _java_options

    search_settings = dict(
        env_vars=['JAVAHOME', 'JAVA_HOME'],
        verbose=verbose,
        binary_names=['java.exe'],
    )
    _java_bin = find_binary('java', bin, **search_settings)

    # Nothing more to do when the caller did not supply options.
    if options is None:
        return
    if isinstance(options, string_types):
        options = options.split()
    _java_options = list(options)
|
|
|
|
|
|
def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
    """
    Execute the given java command, by opening a subprocess that calls
    Java.  If java has not yet been configured, it will be configured
    by calling ``config_java()`` with no arguments.

    :param cmd: The java command that should be called, formatted as
        a list of strings.  Typically, the first string will be the name
        of the java class; and the remaining strings will be arguments
        for that java class.
    :type cmd: list(str)

    :param classpath: Either a single classpath string or an iterable
        of classpath entries (directories, JAR archives, ZIP archives);
        an iterable is joined with ``os.path.pathsep``.  If ``None``,
        no ``-cp`` option is passed and Java falls back to its own
        default classpath.
    :type classpath: str or iterable(str) or None

    :param stdin, stdout, stderr: Specify the executed program's
        standard input, standard output and standard error file
        handles, respectively.  Valid values are ``subprocess.PIPE``,
        an existing file descriptor (a positive integer), an existing
        file object, the strings ``'pipe'``, ``'stdout'`` and
        ``'devnull'`` (translated to the matching ``subprocess``
        constants), and None.  With None, no redirection will occur.
        Additionally, stderr can be ``subprocess.STDOUT``.

    :param blocking: If false, then return immediately after
        spawning the subprocess.  In this case, the return value is
        the ``Popen`` object, and not a ``(stdout, stderr)`` tuple.

    :return: If ``blocking=True``, the ``(stdout, stderr)`` tuple
        returned by ``Popen.communicate()``.  If ``blocking=False``,
        the ``subprocess.Popen`` object.

    :raise TypeError: If ``cmd`` is a string rather than a list.
    :raise OSError: If the java command returns a nonzero return code.
    """
    # String shortcuts for the subprocess constants.  DEVNULL is only
    # available on Python 3.3+, so it is registered conditionally instead
    # of breaking every call on older interpreters.
    stream_aliases = {'pipe': subprocess.PIPE, 'stdout': subprocess.STDOUT}
    if hasattr(subprocess, 'DEVNULL'):
        stream_aliases['devnull'] = subprocess.DEVNULL

    stdin = stream_aliases.get(stdin, stdin)
    stdout = stream_aliases.get(stdout, stdout)
    stderr = stream_aliases.get(stderr, stderr)

    if isinstance(cmd, string_types):
        raise TypeError('cmd should be a list of strings')

    # Make sure we know where a java binary is.
    if _java_bin is None:
        config_java()

    # Set up the classpath.  The default (None) previously crashed in
    # list(None); now it simply omits the -cp option.
    if classpath is None:
        classpath_args = []
    else:
        if isinstance(classpath, string_types):
            classpaths = [classpath]
        else:
            classpaths = list(classpath)
        classpath_args = ['-cp', os.path.pathsep.join(classpaths)]

    # Construct the full command: binary, global options, classpath, command.
    cmd = [_java_bin] + _java_options + classpath_args + list(cmd)

    # Call java via a subprocess
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
    if not blocking:
        return p
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        # stderr is None unless it was piped; avoid printing a bare "None".
        if stderr is not None:
            print(_decode_stdoutdata(stderr))
        raise OSError('Java command failed : ' + str(cmd))

    return (stdout, stderr)
|
|
|
|
|
|
if 0:
    # Usage sketch kept for reference only; the guard above means it is
    # never executed.
    #
    # config_java(options='-Xmx512m')
    # Write:
    # java('weka.classifiers.bayes.NaiveBayes',
    # ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'],
    # classpath='/Users/edloper/Desktop/weka/weka.jar')
    # Read:
    weka_command = [
        'weka.classifiers.bayes.NaiveBayes',
        '-l',
        '/tmp/names.model',
        '-T',
        '/tmp/test.arff',
        '-p',
        '0',
    ]  # , '-distribution'],
    (a, b) = java(weka_command, classpath='/Users/edloper/Desktop/weka/weka.jar')
|
|
|
|
|
|
######################################################################
|
|
# Parsing
|
|
######################################################################
|
|
|
|
|
|
class ReadError(ValueError):
    """
    Exception raised by read_* functions when they fail.

    :param expected: What was expected when an error occurred.
    :param position: The index in the input string where an error occurred.
    """

    def __init__(self, expected, position):
        # Both values are also forwarded to ValueError so they show up
        # in ``args``.
        super(ReadError, self).__init__(expected, position)
        self.expected, self.position = expected, position

    def __str__(self):
        details = (self.expected, self.position)
        return 'Expected %s at %s' % details
|
|
|
|
|
|
_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")


def read_str(s, start_position):
    """
    If a Python string literal begins at the specified position in the
    given string, then return a tuple ``(val, end_position)``
    containing the value of the string literal and the position where
    it ends.  Otherwise, raise a ``ReadError``.

    :param s: The string in which to look for a Python string literal.
    :type s: str

    :param start_position: The index in ``s`` at which the literal
        (including any ``u``/``r`` prefix) must begin.
    :type start_position: int

    :return: A tuple containing the evaluated string literal and the
        index just past its closing quote.
    :rtype: tuple(str, int)

    :raise ReadError: If no opening quote is found at
        ``start_position``, if no matching closing quote is found, or
        if the delimited text is not a valid string literal (for
        example, it contains an invalid escape sequence).

    :Example:
    >>> from nltk.internals import read_str
    >>> read_str('"Hello", World!', 0)
    ('Hello', 7)

    """
    # Read the open quote, and any modifiers.
    m = _STRING_START_RE.match(s, start_position)
    if not m:
        raise ReadError('open quote', start_position)
    quotemark = m.group(1)

    # Find the close quote: scan for either a backslash (whose following
    # character is skipped) or the same quote mark that opened the literal.
    _STRING_END_RE = re.compile(r'\\|%s' % quotemark)
    position = m.end()
    while True:
        match = _STRING_END_RE.search(s, position)
        if not match:
            raise ReadError('close quote', position)
        if match.group(0) == '\\':
            position = match.end() + 1
        else:
            break

    # Process it, using eval.
    # NOTE(review): eval() is applied to a slice of the caller's input.  The
    # slice is delimited by the quote scan above, but callers feeding
    # untrusted text should be aware that eval is involved.
    # Invalid literals raise ValueError on some interpreters and SyntaxError
    # on others (e.g. bad escapes on Python 3), so both are converted.
    try:
        return eval(s[start_position : match.end()]), match.end()
    except (ValueError, SyntaxError) as e:
        # ReadError requires a position; the original call omitted it.
        raise ReadError('invalid string (%s)' % e, start_position)
|
|
|
|
|
|
_READ_INT_RE = re.compile(r'-?\d+')


def read_int(s, start_position):
    """
    Parse an (optionally negative) integer that starts exactly at
    ``start_position`` in ``s``.

    :param s: The string to read from.
    :type s: str

    :param start_position: The index in ``s`` at which the integer
        must begin.
    :type start_position: int

    :return: A tuple ``(val, end_position)`` holding the integer value
        and the index just past its last digit.
    :rtype: tuple(int, int)

    :raise ReadError: If ``_READ_INT_RE`` does not match ``s`` at
        ``start_position``.

    :Example:
    >>> from nltk.internals import read_int
    >>> read_int('42 is the answer', 0)
    (42, 2)

    """
    found = _READ_INT_RE.match(s, start_position)
    if found is None:
        raise ReadError('integer', start_position)
    digits = found.group()
    return int(digits), found.end()
|
|
|
|
|
|
_READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')


def read_number(s, start_position):
    """
    If an integer or float begins at the specified position in the
    given string, then return a tuple ``(val, end_position)``
    containing the value of the number and the position where it ends.
    Otherwise, raise a ``ReadError``.

    :param s: The string to read from.
    :type s: str

    :param start_position: The index in ``s`` at which the number must
        begin.
    :type start_position: int

    :return: A tuple containing the number and the index just past it.
        The number is a ``float`` when the matched text has a
        fractional part (a ``.`` and/or digits after it), and an
        ``int`` otherwise.
    :rtype: tuple(int or float, int)

    :raise ReadError: If no number is found at ``start_position``;
        this includes text such as a lone ``'.'`` that the regex
        accepts but that is not a valid number.

    :Example:
    >>> from nltk.internals import read_number
    >>> read_number('Pi is 3.14159', 6)
    (3.14159, 13)

    """
    m = _READ_NUMBER_VALUE.match(s, start_position)
    if not m or not (m.group(1) or m.group(2)):
        raise ReadError('number', start_position)

    text = m.group()
    try:
        if m.group(2):
            return float(text), m.end()
        return int(text), m.end()
    except ValueError:
        # The regex allows a bare '.' (or '-.'), which float() rejects;
        # report it consistently with every other read failure.
        raise ReadError('number', start_position)
|
|
|
|
|
|
######################################################################
|
|
# Check if a method has been overridden
|
|
######################################################################
|
|
|
|
|
|
def overridden(method):
    """
    Report whether ``method`` is defined by more than one class in the
    method resolution order of the class it is bound through.

    :return: True if ``method`` overrides some method with the same
        name in a base class.  This is typically used when defining
        abstract base classes or interfaces, to allow subclasses to define
        either of two related methods:

        >>> class EaterI:
        ...     '''Subclass must define eat() or batch_eat().'''
        ...     def eat(self, food):
        ...         if overridden(self.batch_eat):
        ...             return self.batch_eat([food])[0]
        ...         else:
        ...             raise NotImplementedError()
        ...     def batch_eat(self, foods):
        ...         return [self.eat(food) for food in foods]

    :type method: instance method
    :raise TypeError: If ``method`` is not a method object, or
        ``compat.get_im_class`` returns None for it.
    """
    # [xx] breaks on classic classes!
    is_bound_method = (
        isinstance(method, types.MethodType)
        and compat.get_im_class(method) is not None
    )
    if not is_bound_method:
        raise TypeError('Expected an instance method.')

    name = method.__name__
    definitions = 0
    # Count the classes along the MRO that define ``name`` themselves.
    for cls in _mro(compat.get_im_class(method)):
        if name in cls.__dict__:
            definitions += 1
    return definitions > 1
|
|
|
|
|
|
def _mro(cls):
|
|
"""
|
|
Return the method resolution order for ``cls`` -- i.e., a list
|
|
containing ``cls`` and all its base classes, in the order in which
|
|
they would be checked by ``getattr``. For new-style classes, this
|
|
is just cls.__mro__. For classic classes, this can be obtained by
|
|
a depth-first left-to-right traversal of ``__bases__``.
|
|
"""
|
|
if isinstance(cls, type):
|
|
return cls.__mro__
|
|
else:
|
|
mro = [cls]
|
|
for base in cls.__bases__:
|
|
mro.extend(_mro(base))
|
|
return mro
|
|
|
|
|
|
######################################################################
|
|
# Deprecation decorator & base class
|
|
######################################################################
|
|
# [xx] dedent msg first if it comes from a docstring.
|
|
|
|
|
|
def _add_epytext_field(obj, field, message):
|
|
"""Add an epytext @field to a given object's docstring."""
|
|
indent = ''
|
|
# If we already have a docstring, then add a blank line to separate
|
|
# it from the new field, and check its indentation.
|
|
if obj.__doc__:
|
|
obj.__doc__ = obj.__doc__.rstrip() + '\n\n'
|
|
indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
|
|
if indents:
|
|
indent = min(indents)
|
|
# If we don't have a docstring, add an empty one.
|
|
else:
|
|
obj.__doc__ = ''
|
|
|
|
obj.__doc__ += textwrap.fill(
|
|
'@%s: %s' % (field, message),
|
|
initial_indent=indent,
|
|
subsequent_indent=indent + ' ',
|
|
)
|
|
|
|
|
|
def deprecated(message):
    """
    A decorator used to mark functions as deprecated.  The wrapped
    function issues a ``DeprecationWarning`` each time it is called,
    and gains a ``@deprecated`` field in its docstring.  Usage:

        >>> from nltk.internals import deprecated
        >>> @deprecated('Use foo() instead')
        ... def bar(x):
        ...     print(x/10)

    :param message: Text appended to the warning and stored in the
        ``@deprecated`` docstring field.
    """

    def decorator(func):
        # Build the warning text once, at decoration time.
        headline = "Function %s() has been deprecated. %s" % (func.__name__, message)
        msg = '\n' + textwrap.fill(headline, initial_indent=' ', subsequent_indent=' ')

        def newFunc(*args, **kwargs):
            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        # Carry over the original function's dict, name and docstring,
        # then flag the wrapper as deprecated.
        newFunc.__dict__.update(func.__dict__)
        for attribute in ('__name__', '__doc__'):
            setattr(newFunc, attribute, getattr(func, attribute))
        newFunc.__deprecated__ = True

        # Add a @deprecated field to the docstring.
        _add_epytext_field(newFunc, 'deprecated', message)
        return newFunc

    return decorator
|
|
|
|
|
|
class Deprecated(object):
    """
    A base class used to mark deprecated classes.  A typical usage is to
    alert users that the name of a class has changed:

        >>> from nltk.internals import Deprecated
        >>> class NewClassName(object):
        ...     pass # All logic goes here.
        ...
        >>> class OldClassName(Deprecated, NewClassName):
        ...     "Use NewClassName instead."

    The docstring of the deprecated class will be used in the
    deprecation warning message.
    """

    def __new__(cls, *args, **kwargs):
        """
        Issue a ``DeprecationWarning`` naming the deprecated class, then
        create the instance with ``object.__new__``.
        """
        # Figure out which class is the deprecated one: the first class
        # in the MRO that lists Deprecated as a direct base.
        dep_cls = None
        for base in cls.__mro__:
            if Deprecated in base.__bases__:
                dep_cls = base
                break
        assert dep_cls, 'Unable to determine which base is deprecated.'

        # Construct an appropriate warning.  The parentheses matter:
        # without them ``.strip()`` only applied to the empty literal.
        doc = (dep_cls.__doc__ or '').strip()
        # If there's a @deprecated field, strip off the field marker.
        doc = re.sub(r'\A\s*@deprecated:', r'', doc)
        # Strip off any indentation.
        doc = re.sub(r'(?m)^\s*', '', doc)
        # Construct a 'name' string.
        name = 'Class %s' % dep_cls.__name__
        if cls != dep_cls:
            name += ' (base class for %s)' % cls.__name__
        # Put it all together.
        msg = '%s has been deprecated. %s' % (name, doc)
        # Wrap it.
        msg = '\n' + textwrap.fill(msg, initial_indent=' ', subsequent_indent=' ')
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
        # Do the actual work of __new__.
        return object.__new__(cls)
|
|
|
|
|
|
##########################################################################
|
|
# COUNTER, FOR UNIQUE NAMING
|
|
##########################################################################
|
|
|
|
|
|
class Counter:
    """
    A counter that auto-increments each time its value is read.

    The first call to ``get()`` returns ``initial_value + 1``.
    """

    def __init__(self, initial_value=0):
        self._value = initial_value

    def get(self):
        """Advance the counter by one and return the new value."""
        bumped = self._value + 1
        self._value = bumped
        return bumped
|
|
|
|
|
|
##########################################################################
|
|
# Search for files/binaries
|
|
##########################################################################
|
|
|
|
|
|
def find_file_iter(
    filename,
    env_vars=(),
    searchpath=(),
    file_names=None,
    url=None,
    verbose=False,
    finding_dir=False,
):
    """
    Search for a file to be used by nltk, yielding every location found.

    Locations are tried in this order: paths derived directly from
    ``filename`` and the alternative names, the given environment
    variables, the ``searchpath`` directories, and finally (on POSIX
    systems) the ``which`` command.

    :param filename: The name or path of the file.
    :param env_vars: A list of environment variable names to check
        (a single string is split on whitespace).
    :param file_names: A list of alternative file names to check.
    :param searchpath: List of directories to search.
    :param url: URL presented to user for download help.
    :param verbose: Whether or not to print path when a file is found.
    :param finding_dir: If true, the raw value of each environment
        variable that is set is yielded as well.
    :raise LookupError: Once the search is exhausted, if nothing was
        yielded.
    """
    file_names = [filename] + (file_names or [])
    assert isinstance(filename, string_types)
    assert not isinstance(file_names, string_types)
    assert not isinstance(searchpath, string_types)
    if isinstance(env_vars, string_types):
        env_vars = env_vars.split()
    yielded = False

    def _announce(location):
        # Shared verbose reporting for every hit that prints.
        if verbose:
            print('[Found %s: %s]' % (filename, location))

    # File exists, no magic: try <filename>/<alt>, the bare alternative,
    # and <filename>/file/<alt>, in that order.
    for alternative in file_names:
        direct_candidates = (
            os.path.join(filename, alternative),
            alternative,
            os.path.join(filename, 'file', alternative),
        )
        for candidate in direct_candidates:
            if os.path.isfile(candidate):
                _announce(candidate)
                yielded = True
                yield candidate

    # Check environment variables
    for env_var in env_vars:
        if env_var not in os.environ:
            continue

        if finding_dir:  # This is to file a directory instead of file
            yielded = True
            yield os.environ[env_var]

        for env_dir in os.environ[env_var].split(os.pathsep):
            # Check if the environment variable contains a direct path to the bin
            if os.path.isfile(env_dir):
                _announce(env_dir)
                yielded = True
                yield env_dir
            # Check if the possible bin names exist inside the environment
            # variable directories, or inside their 'bin' subdirectory.
            for alternative in file_names:
                env_candidates = (
                    os.path.join(env_dir, alternative),
                    os.path.join(env_dir, 'bin', alternative),
                )
                for candidate in env_candidates:
                    if os.path.isfile(candidate):
                        _announce(candidate)
                        yielded = True
                        yield candidate

    # Check the path list (these hits are not reported when verbose).
    for directory in searchpath:
        for alternative in file_names:
            candidate = os.path.join(directory, alternative)
            if os.path.isfile(candidate):
                yielded = True
                yield candidate

    # If we're on a POSIX system, then try using the 'which' command
    # to find the file.  Any error from launching it propagates.
    if os.name == 'posix':
        for alternative in file_names:
            which = subprocess.Popen(
                ['which', alternative],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            which_out, _which_err = which.communicate()
            path = _decode_stdoutdata(which_out).strip()
            if path.endswith(alternative) and os.path.exists(path):
                _announce(path)
                yielded = True
                yield path

    if yielded:
        return

    # Nothing was found anywhere: explain what was tried.
    msg = (
        "NLTK was unable to find the %s file!"
        "\nUse software specific "
        "configuration paramaters" % filename
    )
    if env_vars:
        msg += ' or set the %s environment variable' % env_vars[0]
    msg += '.'
    if searchpath:
        msg += '\n\n Searched in:'
        msg += ''.join('\n - %s' % d for d in searchpath)
    if url:
        msg += '\n\n For more information on %s, see:\n <%s>' % (filename, url)
    div = '=' * 75
    raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
|
|
|
|
|
|
def find_file(
    filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False
):
    """
    Return the first location produced by ``find_file_iter`` for the
    given arguments; see that function for the parameters and for the
    ``LookupError`` raised when nothing is found.
    """
    matches = find_file_iter(
        filename,
        env_vars=env_vars,
        searchpath=searchpath,
        file_names=file_names,
        url=url,
        verbose=verbose,
    )
    return next(matches)
|
|
|
|
|
|
def find_dir(
    filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False
):
    """
    Return the first location produced by ``find_file_iter`` when run
    with ``finding_dir=True``; see that function for the parameters.
    """
    matches = find_file_iter(
        filename,
        env_vars=env_vars,
        searchpath=searchpath,
        file_names=file_names,
        url=url,
        verbose=verbose,
        finding_dir=True,
    )
    return next(matches)
|
|
|
|
|
|
def find_binary_iter(
    name,
    path_to_bin=None,
    env_vars=(),
    searchpath=(),
    binary_names=None,
    url=None,
    verbose=False,
):
    """
    Search for a binary to be used by nltk, yielding every location
    that ``find_file_iter`` reports.

    :param name: The name or path of the file.
    :param path_to_bin: The user-supplied binary location (deprecated);
        when given, it is searched for instead of ``name``.
    :param env_vars: A list of environment variable names to check.
    :param binary_names: A list of alternative file names to check.
    :param searchpath: List of directories to search.
    :param url: URL presented to user for download help.
    :param verbose: Whether or not to print path when a file is found.
    """
    target = path_to_bin or name
    locations = find_file_iter(
        target, env_vars, searchpath, binary_names, url, verbose
    )
    for location in locations:
        yield location
|
|
|
|
|
|
def find_binary(
    name,
    path_to_bin=None,
    env_vars=(),
    searchpath=(),
    binary_names=None,
    url=None,
    verbose=False,
):
    """
    Return the first location produced by ``find_binary_iter`` for the
    given arguments; see that function for the parameters.
    """
    matches = find_binary_iter(
        name,
        path_to_bin=path_to_bin,
        env_vars=env_vars,
        searchpath=searchpath,
        binary_names=binary_names,
        url=url,
        verbose=verbose,
    )
    return next(matches)
|
|
|
|
|
|
def find_jar_iter(
    name_pattern,
    path_to_jar=None,
    env_vars=(),
    searchpath=(),
    url=None,
    verbose=False,
    is_regex=False,
):
    """
    Search for a jar that is used by nltk, yielding every match found.

    :param name_pattern: The name of the jar file, or a regular
        expression for it when ``is_regex`` is true.
    :param path_to_jar: The user-supplied jar location, or None.
    :param env_vars: A list of environment variable names to check
        in addition to the CLASSPATH variable which is
        checked by default.
    :param searchpath: List of directories to search.  Entries that
        are not existing directories are skipped.
    :param url: URL presented to user for download help.
    :param verbose: Whether or not to print the path when a jar is found.
    :param is_regex: Whether name is a regular expression.
    :raise LookupError: If ``path_to_jar`` is given but is not a file,
        or if the search finishes without yielding anything.
    """

    assert isinstance(name_pattern, string_types)
    assert not isinstance(searchpath, string_types)
    if isinstance(env_vars, string_types):
        env_vars = env_vars.split()
    yielded = False

    def _matches(filename):
        # Exact comparison unless the caller asked for regex matching.
        if is_regex:
            return bool(re.match(name_pattern, filename))
        return filename == name_pattern

    def _report(label, location):
        if verbose:
            print('[Found %s: %s]' % (label, location))

    # Make sure we check the CLASSPATH first
    env_vars = ['CLASSPATH'] + list(env_vars)

    # If an explicit location was given, then check it, and yield it if
    # it's present; otherwise, complain.
    if path_to_jar is not None:
        if os.path.isfile(path_to_jar):
            yielded = True
            yield path_to_jar
        else:
            raise LookupError(
                'Could not find %s jar file at %s' % (name_pattern, path_to_jar)
            )

    # Check environment variables
    for env_var in env_vars:
        if env_var not in os.environ:
            continue

        if env_var == 'CLASSPATH':
            classpath = os.environ['CLASSPATH']
            for cp in classpath.split(os.path.pathsep):
                # A classpath entry that is itself the jar file.
                if os.path.isfile(cp):
                    if _matches(os.path.basename(cp)):
                        _report(name_pattern, cp)
                        yielded = True
                        yield cp
                # The case where user put directory containing the jar file in the classpath
                if os.path.isdir(cp):
                    if not is_regex:
                        candidate = os.path.join(cp, name_pattern)
                        if os.path.isfile(candidate):
                            # Report the jar itself, not just its directory.
                            _report(name_pattern, candidate)
                            yielded = True
                            yield candidate
                    else:
                        # Look for file using regular expression
                        for file_name in os.listdir(cp):
                            if re.match(name_pattern, file_name):
                                candidate = os.path.join(cp, file_name)
                                _report(name_pattern, candidate)
                                yielded = True
                                yield candidate
        else:
            # Any other variable names either a directory of jars or a
            # single jar file.
            jar_env = os.environ[env_var]
            if os.path.isdir(jar_env):
                candidates = [
                    os.path.join(jar_env, entry) for entry in os.listdir(jar_env)
                ]
            else:
                candidates = [jar_env]
            for candidate in candidates:
                if os.path.isfile(candidate) and _matches(os.path.basename(candidate)):
                    _report(name_pattern, candidate)
                    yielded = True
                    yield candidate

    # Check the path list.
    for directory in searchpath:
        if is_regex:
            # os.listdir raises on a missing directory; skip such entries
            # so the regex search is as tolerant as the exact-name search.
            if not os.path.isdir(directory):
                continue
            for filename in os.listdir(directory):
                candidate = os.path.join(directory, filename)
                if os.path.isfile(candidate) and re.match(name_pattern, filename):
                    _report(filename, candidate)
                    yielded = True
                    yield candidate
        else:
            candidate = os.path.join(directory, name_pattern)
            if os.path.isfile(candidate):
                _report(name_pattern, candidate)
                yielded = True
                yield candidate

    if not yielded:
        # If nothing was found, raise an error
        msg = "NLTK was unable to find %s!" % name_pattern
        if env_vars:
            msg += ' Set the %s environment variable' % env_vars[0]
        msg = textwrap.fill(msg + '.', initial_indent=' ', subsequent_indent=' ')
        if searchpath:
            msg += '\n\n Searched in:'
            msg += ''.join('\n - %s' % d for d in searchpath)
        if url:
            msg += '\n\n For more information, on %s, see:\n <%s>' % (
                name_pattern,
                url,
            )
        div = '=' * 75
        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
|
|
|
|
|
|
def find_jar(
    name_pattern,
    path_to_jar=None,
    env_vars=(),
    searchpath=(),
    url=None,
    verbose=False,
    is_regex=False,
):
    """
    Return the first jar location produced by ``find_jar_iter`` for the
    given arguments; see that function for the parameters.
    """
    matches = find_jar_iter(
        name_pattern,
        path_to_jar=path_to_jar,
        env_vars=env_vars,
        searchpath=searchpath,
        url=url,
        verbose=verbose,
        is_regex=is_regex,
    )
    return next(matches)
|
|
|
|
|
|
def find_jars_within_path(path_to_jars):
    """
    Walk the directory tree rooted at ``path_to_jars`` and return a list
    with the path of every file whose name matches ``*.jar``.
    """
    jars = []
    for root, _dirnames, filenames in os.walk(path_to_jars):
        jars.extend(
            os.path.join(root, jar_name)
            for jar_name in fnmatch.filter(filenames, '*.jar')
        )
    return jars
|
|
|
|
|
|
def _decode_stdoutdata(stdoutdata):
|
|
""" Convert data read from stdout/stderr to unicode """
|
|
if not isinstance(stdoutdata, bytes):
|
|
return stdoutdata
|
|
|
|
encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding())
|
|
if encoding is None:
|
|
return stdoutdata.decode()
|
|
return stdoutdata.decode(encoding)
|
|
|
|
|
|
##########################################################################
|
|
# Import Stdlib Module
|
|
##########################################################################
|
|
|
|
|
|
def import_from_stdlib(module):
    """
    Import ``module`` with the current directory temporarily removed
    from the module search path, and return it.

    When python is run from within the nltk/ directory tree, the
    current directory is included at the beginning of the search path.
    Unfortunately, that means that modules within nltk can sometimes
    shadow standard library modules.  As an example, the stdlib
    'inspect' module will attempt to import the stdlib 'tokenize'
    module, but will instead end up importing NLTK's 'tokenize' module
    instead (causing the import to fail).

    :param module: Name of the module, as accepted by ``__import__``.
    :return: The object returned by ``__import__(module)``.
    """
    old_path = sys.path
    sys.path = [d for d in sys.path if d not in ('', '.')]
    try:
        return __import__(module)
    finally:
        # Always put the original list object back, even if the import
        # fails, so a failed import cannot leave sys.path altered.
        sys.path = old_path
|
|
|
|
|
|
##########################################################################
|
|
# Wrapper for ElementTree Elements
|
|
##########################################################################
|
|
|
|
|
|
@compat.python_2_unicode_compatible
|
|
class ElementWrapper(object):
|
|
"""
|
|
A wrapper around ElementTree Element objects whose main purpose is
|
|
to provide nicer __repr__ and __str__ methods. In addition, any
|
|
of the wrapped Element's methods that return other Element objects
|
|
are overridden to wrap those values before returning them.
|
|
|
|
This makes Elements more convenient to work with in
|
|
interactive sessions and doctests, at the expense of some
|
|
efficiency.
|
|
"""
|
|
|
|
# Prevent double-wrapping:
|
|
def __new__(cls, etree):
|
|
"""
|
|
Create and return a wrapper around a given Element object.
|
|
If ``etree`` is an ``ElementWrapper``, then ``etree`` is
|
|
returned as-is.
|
|
"""
|
|
if isinstance(etree, ElementWrapper):
|
|
return etree
|
|
else:
|
|
return object.__new__(ElementWrapper)
|
|
|
|
def __init__(self, etree):
|
|
r"""
|
|
Initialize a new Element wrapper for ``etree``.
|
|
|
|
If ``etree`` is a string, then it will be converted to an
|
|
Element object using ``ElementTree.fromstring()`` first:
|
|
|
|
>>> ElementWrapper("<test></test>")
|
|
<Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
|
|
|
|
"""
|
|
if isinstance(etree, string_types):
|
|
etree = ElementTree.fromstring(etree)
|
|
self.__dict__['_etree'] = etree
|
|
|
|
def unwrap(self):
|
|
"""
|
|
Return the Element object wrapped by this wrapper.
|
|
"""
|
|
return self._etree
|
|
|
|
##////////////////////////////////////////////////////////////
|
|
# { String Representation
|
|
##////////////////////////////////////////////////////////////
|
|
|
|
def __repr__(self):
|
|
s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8')
|
|
if len(s) > 60:
|
|
e = s.rfind('<')
|
|
if (len(s) - e) > 30:
|
|
e = -20
|
|
s = '%s...%s' % (s[:30], s[e:])
|
|
return '<Element %r>' % s
|
|
|
|
def __str__(self):
|
|
"""
|
|
:return: the result of applying ``ElementTree.tostring()`` to
|
|
the wrapped Element object.
|
|
"""
|
|
return (
|
|
ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip()
|
|
)
|
|
|
|
##////////////////////////////////////////////////////////////
|
|
# { Element interface Delegation (pass-through)
|
|
##////////////////////////////////////////////////////////////
|
|
|
|
def __getattr__(self, attrib):
|
|
return getattr(self._etree, attrib)
|
|
|
|
def __setattr__(self, attr, value):
|
|
return setattr(self._etree, attr, value)
|
|
|
|
def __delattr__(self, attr):
|
|
return delattr(self._etree, attr)
|
|
|
|
def __setitem__(self, index, element):
|
|
self._etree[index] = element
|
|
|
|
def __delitem__(self, index):
|
|
del self._etree[index]
|
|
|
|
def __setslice__(self, start, stop, elements):
|
|
self._etree[start:stop] = elements
|
|
|
|
def __delslice__(self, start, stop):
|
|
del self._etree[start:stop]
|
|
|
|
def __len__(self):
|
|
return len(self._etree)
|
|
|
|
##////////////////////////////////////////////////////////////
|
|
# { Element interface Delegation (wrap result)
|
|
##////////////////////////////////////////////////////////////
|
|
|
|
def __getitem__(self, index):
|
|
return ElementWrapper(self._etree[index])
|
|
|
|
def __getslice__(self, start, stop):
|
|
return [ElementWrapper(elt) for elt in self._etree[start:stop]]
|
|
|
|
def getchildren(self):
    """
    :return: a list holding an ``ElementWrapper`` for each item
        obtained by iterating over the wrapped Element object.
    """
    children = []
    for child in self._etree:
        children.append(ElementWrapper(child))
    return children
|
|
|
|
def getiterator(self, tag=None):
    """
    Iterate over the elements produced by the wrapped Element's
    ``iter(tag)`` method, wrapping each one in ``ElementWrapper``.

    :param tag: Passed through unchanged to ``iter()``; the default
        ``None`` is what the original call passed as well.
    :return: a generator of ``ElementWrapper`` objects.
    """
    # Element.getiterator() was deprecated in favour of Element.iter()
    # and removed in Python 3.9, so call iter() directly.
    return (ElementWrapper(elt) for elt in self._etree.iter(tag))
|
|
|
|
def makeelement(self, tag, attrib):
    """
    Call ``makeelement(tag, attrib)`` on the wrapped Element object
    and wrap the result.

    :return: an ``ElementWrapper`` around the newly made element.
    """
    made = self._etree.makeelement(tag, attrib)
    return ElementWrapper(made)
|
|
|
|
def find(self, path):
    """
    Call ``find(path)`` on the wrapped Element object.

    :return: ``None`` when the wrapped ``find()`` returns ``None``;
        otherwise an ``ElementWrapper`` around the element it found.
    """
    match = self._etree.find(path)
    return None if match is None else ElementWrapper(match)
|
|
|
|
def findall(self, path):
    """
    Call ``findall(path)`` on the wrapped Element object.

    :return: a list holding an ``ElementWrapper`` for each element the
        wrapped ``findall()`` returned.
    """
    matches = []
    for match in self._etree.findall(path):
        matches.append(ElementWrapper(match))
    return matches
|
|
|
|
|
|
######################################################################
|
|
# Helper for Handling Slicing
|
|
######################################################################
|
|
|
|
|
|
def slice_bounds(sequence, slice_obj, allow_step=False):
    """
    Translate a slice object into concrete (start, stop) offsets into
    ``sequence``, resolving ``None`` and negative indices.  The
    returned values satisfy:

      - 0 <= start <= len(sequence)
      - 0 <= stop <= len(sequence)
      - start <= stop

    :param sequence: The sequence being sliced; it must support
        ``len()`` and integer indexing.
    :param slice_obj: The slice whose bounds should be computed.
    :param allow_step: If true, then the slice object may have a
        non-None step, and a tuple (start, stop, step) is returned,
        with a step of None reported as 1.  For a negative step the
        slice's start and stop are exchanged before the bounds are
        computed.
    :raise ValueError: If ``allow_step`` is false and
        ``slice_obj.step`` is neither None nor 1.
    """
    lower, upper = slice_obj.start, slice_obj.stop

    if allow_step:
        stride = slice_obj.step
        if stride is None:
            stride = 1
        # With a negative stride the slice's start and stop trade
        # places (including which default applies to which), so
        # exchange them before computing plain bounds recursively.
        if stride < 0:
            plain = slice(upper, lower)
        else:
            plain = slice(lower, upper)
        lower, upper = slice_bounds(sequence, plain)
        return lower, upper, stride

    # Steps are not allowed here; only the trivial ones are accepted.
    if slice_obj.step not in (None, 1):
        raise ValueError(
            'slices with steps are not supported by %s' % sequence.__class__.__name__
        )

    # Fill in omitted bounds.
    if lower is None:
        lower = 0
    if upper is None:
        upper = len(sequence)

    # Negative offsets count back from the end, clamped at zero.
    if lower < 0:
        lower = max(0, len(sequence) + lower)
    if upper < 0:
        upper = max(0, len(sequence) + upper)

    # Clamp the upper bound to the sequence length.  Probing the last
    # requested index first avoids calling len(), which can be
    # expensive for lazy sequences, whenever the index is valid.
    if upper > 0:
        try:
            sequence[upper - 1]
        except IndexError:
            upper = len(sequence)

    # The lower bound may not exceed the upper bound.
    return min(lower, upper), upper
|
|
|
|
|
|
######################################################################
|
|
# Permission Checking
|
|
######################################################################
|
|
|
|
|
|
def is_writable(path):
    """
    Report whether ``path`` exists and looks writable.

    On platforms where ``os.getuid`` is available, the permission bits
    of ``path`` are inspected: the path counts as writable if it is
    world-writable, if it is owned by the current user and
    owner-writable, or if it belongs to one of the current process's
    groups and is group-writable.  On other platforms any existing
    path is assumed to be writable.

    :param path: The filesystem path to check.
    :return: True if the path is considered writable, False otherwise
        (including when the path does not exist).
    :rtype: bool
    """
    # A path that does not exist is never writable.
    if not os.path.exists(path):
        return False

    # Without getuid() there are no posix permissions to check, so
    # assume the path is writable.
    # [xx] should we do other checks on other platforms?
    if not hasattr(os, 'getuid'):
        return True

    info = os.stat(path)
    mode = stat.S_IMODE(info.st_mode)

    # World-writable?
    if mode & 0o002:
        return True
    # Owned by us, and owner-writable?
    if info.st_uid == os.getuid() and (mode & 0o200):
        return True
    # Owned by one of our groups, and group-writable?
    if (info.st_gid in [os.getgid()] + os.getgroups()) and (mode & 0o020):
        return True
    # None of the write bits apply to us.
    return False
|
|
|
|
|
|
######################################################################
|
|
# NLTK Error reporting
|
|
######################################################################
|
|
|
|
|
|
def raise_unorderable_types(ordering, a, b):
    """
    Raise a ``TypeError`` reporting that ``a`` and ``b`` cannot be
    compared with the operator ``ordering``.

    :param ordering: The comparison operator, as it should appear in
        the error message (e.g. ``'<'``).
    :param a: The left-hand operand; only its type name is reported.
    :param b: The right-hand operand; only its type name is reported.
    :raise TypeError: Always.
    """
    left_name = type(a).__name__
    right_name = type(b).__name__
    message = "unorderable types: %s() %s %s()" % (left_name, ordering, right_name)
    raise TypeError(message)
|