You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
666 lines
23 KiB
Python
666 lines
23 KiB
Python
# Natural Language Toolkit: Collections
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
from __future__ import print_function, absolute_import
|
|
|
|
import bisect
|
|
from itertools import islice, chain
|
|
from functools import total_ordering
|
|
# this unused import is for python 2.7
|
|
from collections import defaultdict, deque, Counter
|
|
|
|
from six import text_type
|
|
|
|
from nltk.internals import slice_bounds, raise_unorderable_types
|
|
from nltk.compat import python_2_unicode_compatible
|
|
|
|
|
|
##########################################################################
|
|
# Ordered Dictionary
|
|
##########################################################################
|
|
|
|
|
|
class OrderedDict(dict):
|
|
def __init__(self, data=None, **kwargs):
|
|
self._keys = self.keys(data, kwargs.get('keys'))
|
|
self._default_factory = kwargs.get('default_factory')
|
|
if data is None:
|
|
dict.__init__(self)
|
|
else:
|
|
dict.__init__(self, data)
|
|
|
|
def __delitem__(self, key):
|
|
dict.__delitem__(self, key)
|
|
self._keys.remove(key)
|
|
|
|
def __getitem__(self, key):
|
|
try:
|
|
return dict.__getitem__(self, key)
|
|
except KeyError:
|
|
return self.__missing__(key)
|
|
|
|
def __iter__(self):
|
|
return (key for key in self.keys())
|
|
|
|
def __missing__(self, key):
|
|
if not self._default_factory and key not in self._keys:
|
|
raise KeyError()
|
|
return self._default_factory()
|
|
|
|
def __setitem__(self, key, item):
|
|
dict.__setitem__(self, key, item)
|
|
if key not in self._keys:
|
|
self._keys.append(key)
|
|
|
|
def clear(self):
|
|
dict.clear(self)
|
|
self._keys.clear()
|
|
|
|
def copy(self):
|
|
d = dict.copy(self)
|
|
d._keys = self._keys
|
|
return d
|
|
|
|
def items(self):
|
|
# returns iterator under python 3 and list under python 2
|
|
return zip(self.keys(), self.values())
|
|
|
|
def keys(self, data=None, keys=None):
|
|
if data:
|
|
if keys:
|
|
assert isinstance(keys, list)
|
|
assert len(data) == len(keys)
|
|
return keys
|
|
else:
|
|
assert (
|
|
isinstance(data, dict)
|
|
or isinstance(data, OrderedDict)
|
|
or isinstance(data, list)
|
|
)
|
|
if isinstance(data, dict) or isinstance(data, OrderedDict):
|
|
return data.keys()
|
|
elif isinstance(data, list):
|
|
return [key for (key, value) in data]
|
|
elif '_keys' in self.__dict__:
|
|
return self._keys
|
|
else:
|
|
return []
|
|
|
|
def popitem(self):
|
|
if not self._keys:
|
|
raise KeyError()
|
|
|
|
key = self._keys.pop()
|
|
value = self[key]
|
|
del self[key]
|
|
return (key, value)
|
|
|
|
def setdefault(self, key, failobj=None):
|
|
dict.setdefault(self, key, failobj)
|
|
if key not in self._keys:
|
|
self._keys.append(key)
|
|
|
|
def update(self, data):
|
|
dict.update(self, data)
|
|
for key in self.keys(data):
|
|
if key not in self._keys:
|
|
self._keys.append(key)
|
|
|
|
def values(self):
|
|
# returns iterator under python 3
|
|
return map(self.get, self._keys)
|
|
|
|
|
|
######################################################################
|
|
# Lazy Sequences
|
|
######################################################################
|
|
|
|
|
|
@total_ordering
|
|
@python_2_unicode_compatible
|
|
class AbstractLazySequence(object):
|
|
"""
|
|
An abstract base class for read-only sequences whose values are
|
|
computed as needed. Lazy sequences act like tuples -- they can be
|
|
indexed, sliced, and iterated over; but they may not be modified.
|
|
|
|
The most common application of lazy sequences in NLTK is for
|
|
corpus view objects, which provide access to the contents of a
|
|
corpus without loading the entire corpus into memory, by loading
|
|
pieces of the corpus from disk as needed.
|
|
|
|
The result of modifying a mutable element of a lazy sequence is
|
|
undefined. In particular, the modifications made to the element
|
|
may or may not persist, depending on whether and when the lazy
|
|
sequence caches that element's value or reconstructs it from
|
|
scratch.
|
|
|
|
Subclasses are required to define two methods: ``__len__()``
|
|
and ``iterate_from()``.
|
|
"""
|
|
|
|
def __len__(self):
|
|
"""
|
|
Return the number of tokens in the corpus file underlying this
|
|
corpus view.
|
|
"""
|
|
raise NotImplementedError('should be implemented by subclass')
|
|
|
|
def iterate_from(self, start):
|
|
"""
|
|
Return an iterator that generates the tokens in the corpus
|
|
file underlying this corpus view, starting at the token number
|
|
``start``. If ``start>=len(self)``, then this iterator will
|
|
generate no tokens.
|
|
"""
|
|
raise NotImplementedError('should be implemented by subclass')
|
|
|
|
def __getitem__(self, i):
|
|
"""
|
|
Return the *i* th token in the corpus file underlying this
|
|
corpus view. Negative indices and spans are both supported.
|
|
"""
|
|
if isinstance(i, slice):
|
|
start, stop = slice_bounds(self, i)
|
|
return LazySubsequence(self, start, stop)
|
|
else:
|
|
# Handle negative indices
|
|
if i < 0:
|
|
i += len(self)
|
|
if i < 0:
|
|
raise IndexError('index out of range')
|
|
# Use iterate_from to extract it.
|
|
try:
|
|
return next(self.iterate_from(i))
|
|
except StopIteration:
|
|
raise IndexError('index out of range')
|
|
|
|
def __iter__(self):
|
|
"""Return an iterator that generates the tokens in the corpus
|
|
file underlying this corpus view."""
|
|
return self.iterate_from(0)
|
|
|
|
def count(self, value):
|
|
"""Return the number of times this list contains ``value``."""
|
|
return sum(1 for elt in self if elt == value)
|
|
|
|
def index(self, value, start=None, stop=None):
|
|
"""Return the index of the first occurrence of ``value`` in this
|
|
list that is greater than or equal to ``start`` and less than
|
|
``stop``. Negative start and stop values are treated like negative
|
|
slice bounds -- i.e., they count from the end of the list."""
|
|
start, stop = slice_bounds(self, slice(start, stop))
|
|
for i, elt in enumerate(islice(self, start, stop)):
|
|
if elt == value:
|
|
return i + start
|
|
raise ValueError('index(x): x not in list')
|
|
|
|
def __contains__(self, value):
|
|
"""Return true if this list contains ``value``."""
|
|
return bool(self.count(value))
|
|
|
|
def __add__(self, other):
|
|
"""Return a list concatenating self with other."""
|
|
return LazyConcatenation([self, other])
|
|
|
|
def __radd__(self, other):
|
|
"""Return a list concatenating other with self."""
|
|
return LazyConcatenation([other, self])
|
|
|
|
def __mul__(self, count):
|
|
"""Return a list concatenating self with itself ``count`` times."""
|
|
return LazyConcatenation([self] * count)
|
|
|
|
def __rmul__(self, count):
|
|
"""Return a list concatenating self with itself ``count`` times."""
|
|
return LazyConcatenation([self] * count)
|
|
|
|
_MAX_REPR_SIZE = 60
|
|
|
|
def __repr__(self):
|
|
"""
|
|
Return a string representation for this corpus view that is
|
|
similar to a list's representation; but if it would be more
|
|
than 60 characters long, it is truncated.
|
|
"""
|
|
pieces = []
|
|
length = 5
|
|
for elt in self:
|
|
pieces.append(repr(elt))
|
|
length += len(pieces[-1]) + 2
|
|
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
|
|
return '[%s, ...]' % text_type(', ').join(pieces[:-1])
|
|
return '[%s]' % text_type(', ').join(pieces)
|
|
|
|
def __eq__(self, other):
|
|
return type(self) == type(other) and list(self) == list(other)
|
|
|
|
def __ne__(self, other):
|
|
return not self == other
|
|
|
|
def __lt__(self, other):
|
|
if type(other) != type(self):
|
|
raise_unorderable_types("<", self, other)
|
|
return list(self) < list(other)
|
|
|
|
def __hash__(self):
|
|
"""
|
|
:raise ValueError: Corpus view objects are unhashable.
|
|
"""
|
|
raise ValueError('%s objects are unhashable' % self.__class__.__name__)
|
|
|
|
|
|
class LazySubsequence(AbstractLazySequence):
|
|
"""
|
|
A subsequence produced by slicing a lazy sequence. This slice
|
|
keeps a reference to its source sequence, and generates its values
|
|
by looking them up in the source sequence.
|
|
"""
|
|
|
|
MIN_SIZE = 100
|
|
"""
|
|
The minimum size for which lazy slices should be created. If
|
|
``LazySubsequence()`` is called with a subsequence that is
|
|
shorter than ``MIN_SIZE``, then a tuple will be returned instead.
|
|
"""
|
|
|
|
def __new__(cls, source, start, stop):
|
|
"""
|
|
Construct a new slice from a given underlying sequence. The
|
|
``start`` and ``stop`` indices should be absolute indices --
|
|
i.e., they should not be negative (for indexing from the back
|
|
of a list) or greater than the length of ``source``.
|
|
"""
|
|
# If the slice is small enough, just use a tuple.
|
|
if stop - start < cls.MIN_SIZE:
|
|
return list(islice(source.iterate_from(start), stop - start))
|
|
else:
|
|
return object.__new__(cls)
|
|
|
|
def __init__(self, source, start, stop):
|
|
self._source = source
|
|
self._start = start
|
|
self._stop = stop
|
|
|
|
def __len__(self):
|
|
return self._stop - self._start
|
|
|
|
def iterate_from(self, start):
|
|
return islice(
|
|
self._source.iterate_from(start + self._start), max(0, len(self) - start)
|
|
)
|
|
|
|
|
|
class LazyConcatenation(AbstractLazySequence):
|
|
"""
|
|
A lazy sequence formed by concatenating a list of lists. This
|
|
underlying list of lists may itself be lazy. ``LazyConcatenation``
|
|
maintains an index that it uses to keep track of the relationship
|
|
between offsets in the concatenated lists and offsets in the
|
|
sublists.
|
|
"""
|
|
|
|
def __init__(self, list_of_lists):
|
|
self._list = list_of_lists
|
|
self._offsets = [0]
|
|
|
|
def __len__(self):
|
|
if len(self._offsets) <= len(self._list):
|
|
for tok in self.iterate_from(self._offsets[-1]):
|
|
pass
|
|
return self._offsets[-1]
|
|
|
|
def iterate_from(self, start_index):
|
|
if start_index < self._offsets[-1]:
|
|
sublist_index = bisect.bisect_right(self._offsets, start_index) - 1
|
|
else:
|
|
sublist_index = len(self._offsets) - 1
|
|
|
|
index = self._offsets[sublist_index]
|
|
|
|
# Construct an iterator over the sublists.
|
|
if isinstance(self._list, AbstractLazySequence):
|
|
sublist_iter = self._list.iterate_from(sublist_index)
|
|
else:
|
|
sublist_iter = islice(self._list, sublist_index, None)
|
|
|
|
for sublist in sublist_iter:
|
|
if sublist_index == (len(self._offsets) - 1):
|
|
assert (
|
|
index + len(sublist) >= self._offsets[-1]
|
|
), 'offests not monotonic increasing!'
|
|
self._offsets.append(index + len(sublist))
|
|
else:
|
|
assert self._offsets[sublist_index + 1] == index + len(
|
|
sublist
|
|
), 'inconsistent list value (num elts)'
|
|
|
|
for value in sublist[max(0, start_index - index) :]:
|
|
yield value
|
|
|
|
index += len(sublist)
|
|
sublist_index += 1
|
|
|
|
|
|
class LazyMap(AbstractLazySequence):
|
|
"""
|
|
A lazy sequence whose elements are formed by applying a given
|
|
function to each element in one or more underlying lists. The
|
|
function is applied lazily -- i.e., when you read a value from the
|
|
list, ``LazyMap`` will calculate that value by applying its
|
|
function to the underlying lists' value(s). ``LazyMap`` is
|
|
essentially a lazy version of the Python primitive function
|
|
``map``. In particular, the following two expressions are
|
|
equivalent:
|
|
|
|
>>> from nltk.collections import LazyMap
|
|
>>> function = str
|
|
>>> sequence = [1,2,3]
|
|
>>> map(function, sequence) # doctest: +SKIP
|
|
['1', '2', '3']
|
|
>>> list(LazyMap(function, sequence))
|
|
['1', '2', '3']
|
|
|
|
Like the Python ``map`` primitive, if the source lists do not have
|
|
equal size, then the value None will be supplied for the
|
|
'missing' elements.
|
|
|
|
Lazy maps can be useful for conserving memory, in cases where
|
|
individual values take up a lot of space. This is especially true
|
|
if the underlying list's values are constructed lazily, as is the
|
|
case with many corpus readers.
|
|
|
|
A typical example of a use case for this class is performing
|
|
feature detection on the tokens in a corpus. Since featuresets
|
|
are encoded as dictionaries, which can take up a lot of memory,
|
|
using a ``LazyMap`` can significantly reduce memory usage when
|
|
training and running classifiers.
|
|
"""
|
|
|
|
def __init__(self, function, *lists, **config):
|
|
"""
|
|
:param function: The function that should be applied to
|
|
elements of ``lists``. It should take as many arguments
|
|
as there are ``lists``.
|
|
:param lists: The underlying lists.
|
|
:param cache_size: Determines the size of the cache used
|
|
by this lazy map. (default=5)
|
|
"""
|
|
if not lists:
|
|
raise TypeError('LazyMap requires at least two args')
|
|
|
|
self._lists = lists
|
|
self._func = function
|
|
self._cache_size = config.get('cache_size', 5)
|
|
self._cache = {} if self._cache_size > 0 else None
|
|
|
|
# If you just take bool() of sum() here _all_lazy will be true just
|
|
# in case n >= 1 list is an AbstractLazySequence. Presumably this
|
|
# isn't what's intended.
|
|
self._all_lazy = sum(
|
|
isinstance(lst, AbstractLazySequence) for lst in lists
|
|
) == len(lists)
|
|
|
|
def iterate_from(self, index):
|
|
# Special case: one lazy sublist
|
|
if len(self._lists) == 1 and self._all_lazy:
|
|
for value in self._lists[0].iterate_from(index):
|
|
yield self._func(value)
|
|
return
|
|
|
|
# Special case: one non-lazy sublist
|
|
elif len(self._lists) == 1:
|
|
while True:
|
|
try:
|
|
yield self._func(self._lists[0][index])
|
|
except IndexError:
|
|
return
|
|
index += 1
|
|
|
|
# Special case: n lazy sublists
|
|
elif self._all_lazy:
|
|
iterators = [lst.iterate_from(index) for lst in self._lists]
|
|
while True:
|
|
elements = []
|
|
for iterator in iterators:
|
|
try:
|
|
elements.append(next(iterator))
|
|
except: # FIXME: What is this except really catching? StopIteration?
|
|
elements.append(None)
|
|
if elements == [None] * len(self._lists):
|
|
return
|
|
yield self._func(*elements)
|
|
index += 1
|
|
|
|
# general case
|
|
else:
|
|
while True:
|
|
try:
|
|
elements = [lst[index] for lst in self._lists]
|
|
except IndexError:
|
|
elements = [None] * len(self._lists)
|
|
for i, lst in enumerate(self._lists):
|
|
try:
|
|
elements[i] = lst[index]
|
|
except IndexError:
|
|
pass
|
|
if elements == [None] * len(self._lists):
|
|
return
|
|
yield self._func(*elements)
|
|
index += 1
|
|
|
|
def __getitem__(self, index):
|
|
if isinstance(index, slice):
|
|
sliced_lists = [lst[index] for lst in self._lists]
|
|
return LazyMap(self._func, *sliced_lists)
|
|
else:
|
|
# Handle negative indices
|
|
if index < 0:
|
|
index += len(self)
|
|
if index < 0:
|
|
raise IndexError('index out of range')
|
|
# Check the cache
|
|
if self._cache is not None and index in self._cache:
|
|
return self._cache[index]
|
|
# Calculate the value
|
|
try:
|
|
val = next(self.iterate_from(index))
|
|
except StopIteration:
|
|
raise IndexError('index out of range')
|
|
# Update the cache
|
|
if self._cache is not None:
|
|
if len(self._cache) > self._cache_size:
|
|
self._cache.popitem() # discard random entry
|
|
self._cache[index] = val
|
|
# Return the value
|
|
return val
|
|
|
|
def __len__(self):
|
|
return max(len(lst) for lst in self._lists)
|
|
|
|
|
|
class LazyZip(LazyMap):
|
|
"""
|
|
A lazy sequence whose elements are tuples, each containing the i-th
|
|
element from each of the argument sequences. The returned list is
|
|
truncated in length to the length of the shortest argument sequence. The
|
|
tuples are constructed lazily -- i.e., when you read a value from the
|
|
list, ``LazyZip`` will calculate that value by forming a tuple from
|
|
the i-th element of each of the argument sequences.
|
|
|
|
``LazyZip`` is essentially a lazy version of the Python primitive function
|
|
``zip``. In particular, an evaluated LazyZip is equivalent to a zip:
|
|
|
|
>>> from nltk.collections import LazyZip
|
|
>>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
|
|
>>> zip(sequence1, sequence2) # doctest: +SKIP
|
|
[(1, 'a'), (2, 'b'), (3, 'c')]
|
|
>>> list(LazyZip(sequence1, sequence2))
|
|
[(1, 'a'), (2, 'b'), (3, 'c')]
|
|
>>> sequences = [sequence1, sequence2, [6,7,8,9]]
|
|
>>> list(zip(*sequences)) == list(LazyZip(*sequences))
|
|
True
|
|
|
|
Lazy zips can be useful for conserving memory in cases where the argument
|
|
sequences are particularly long.
|
|
|
|
A typical example of a use case for this class is combining long sequences
|
|
of gold standard and predicted values in a classification or tagging task
|
|
in order to calculate accuracy. By constructing tuples lazily and
|
|
avoiding the creation of an additional long sequence, memory usage can be
|
|
significantly reduced.
|
|
"""
|
|
|
|
def __init__(self, *lists):
|
|
"""
|
|
:param lists: the underlying lists
|
|
:type lists: list(list)
|
|
"""
|
|
LazyMap.__init__(self, lambda *elts: elts, *lists)
|
|
|
|
def iterate_from(self, index):
|
|
iterator = LazyMap.iterate_from(self, index)
|
|
while index < len(self):
|
|
yield next(iterator)
|
|
index += 1
|
|
return
|
|
|
|
def __len__(self):
|
|
return min(len(lst) for lst in self._lists)
|
|
|
|
|
|
class LazyEnumerate(LazyZip):
|
|
"""
|
|
A lazy sequence whose elements are tuples, each ontaining a count (from
|
|
zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is
|
|
useful for obtaining an indexed list. The tuples are constructed lazily
|
|
-- i.e., when you read a value from the list, ``LazyEnumerate`` will
|
|
calculate that value by forming a tuple from the count of the i-th
|
|
element and the i-th element of the underlying sequence.
|
|
|
|
``LazyEnumerate`` is essentially a lazy version of the Python primitive
|
|
function ``enumerate``. In particular, the following two expressions are
|
|
equivalent:
|
|
|
|
>>> from nltk.collections import LazyEnumerate
|
|
>>> sequence = ['first', 'second', 'third']
|
|
>>> list(enumerate(sequence))
|
|
[(0, 'first'), (1, 'second'), (2, 'third')]
|
|
>>> list(LazyEnumerate(sequence))
|
|
[(0, 'first'), (1, 'second'), (2, 'third')]
|
|
|
|
Lazy enumerations can be useful for conserving memory in cases where the
|
|
argument sequences are particularly long.
|
|
|
|
A typical example of a use case for this class is obtaining an indexed
|
|
list for a long sequence of values. By constructing tuples lazily and
|
|
avoiding the creation of an additional long sequence, memory usage can be
|
|
significantly reduced.
|
|
"""
|
|
|
|
def __init__(self, lst):
|
|
"""
|
|
:param lst: the underlying list
|
|
:type lst: list
|
|
"""
|
|
LazyZip.__init__(self, range(len(lst)), lst)
|
|
|
|
|
|
class LazyIteratorList(AbstractLazySequence):
|
|
"""
|
|
Wraps an iterator, loading its elements on demand
|
|
and making them subscriptable.
|
|
__repr__ displays only the first few elements.
|
|
"""
|
|
|
|
def __init__(self, it, known_len=None):
|
|
self._it = it
|
|
self._len = known_len
|
|
self._cache = []
|
|
|
|
def __len__(self):
|
|
if self._len:
|
|
return self._len
|
|
for x in self.iterate_from(len(self._cache)):
|
|
pass
|
|
self._len = len(self._cache)
|
|
return self._len
|
|
|
|
def iterate_from(self, start):
|
|
"""Create a new iterator over this list starting at the given offset."""
|
|
while len(self._cache) < start:
|
|
v = next(self._it)
|
|
self._cache.append(v)
|
|
i = start
|
|
while i < len(self._cache):
|
|
yield self._cache[i]
|
|
i += 1
|
|
while True:
|
|
v = next(self._it)
|
|
self._cache.append(v)
|
|
yield v
|
|
i += 1
|
|
|
|
def __add__(self, other):
|
|
"""Return a list concatenating self with other."""
|
|
return type(self)(chain(self, other))
|
|
|
|
def __radd__(self, other):
|
|
"""Return a list concatenating other with self."""
|
|
return type(self)(chain(other, self))
|
|
|
|
|
|
######################################################################
|
|
# Trie Implementation
|
|
######################################################################
|
|
class Trie(dict):
|
|
"""A Trie implementation for strings"""
|
|
|
|
LEAF = True
|
|
|
|
def __init__(self, strings=None):
|
|
"""Builds a Trie object, which is built around a ``dict``
|
|
|
|
If ``strings`` is provided, it will add the ``strings``, which
|
|
consist of a ``list`` of ``strings``, to the Trie.
|
|
Otherwise, it'll construct an empty Trie.
|
|
|
|
:param strings: List of strings to insert into the trie
|
|
(Default is ``None``)
|
|
:type strings: list(str)
|
|
|
|
"""
|
|
super(Trie, self).__init__()
|
|
if strings:
|
|
for string in strings:
|
|
self.insert(string)
|
|
|
|
def insert(self, string):
|
|
"""Inserts ``string`` into the Trie
|
|
|
|
:param string: String to insert into the trie
|
|
:type string: str
|
|
|
|
:Example:
|
|
|
|
>>> from nltk.collections import Trie
|
|
>>> trie = Trie(["abc", "def"])
|
|
>>> expected = {'a': {'b': {'c': {True: None}}}, \
|
|
'd': {'e': {'f': {True: None}}}}
|
|
>>> trie == expected
|
|
True
|
|
|
|
"""
|
|
if len(string):
|
|
self[string[0]].insert(string[1:])
|
|
else:
|
|
# mark the string is complete
|
|
self[Trie.LEAF] = None
|
|
|
|
def __missing__(self, key):
|
|
self[key] = Trie()
|
|
return self[key]
|