bo-graduation/venv/lib/python3.7/site-packages/nltk/lm/vocabulary.py

# Natural Language Toolkit
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Vocabulary"""

import sys
from collections import Counter
from collections.abc import Iterable
from itertools import chain
from functools import singledispatch


@singledispatch
def _dispatched_lookup(words, vocab):
    """Fallback lookup used when no handler is registered for ``type(words)``.

    :param words: The object that was passed in for lookup.
    :param vocab: The vocabulary to look up in (unused by this fallback).
    :raises TypeError: Always; the message names the offending type.
    """
    message = "Unsupported type for looking up in vocabulary: {0}".format(
        type(words)
    )
    raise TypeError(message)


@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up every element of an iterable in the vocabulary.

    Each element is dispatched again on its own type, so nested iterables
    produce nested tuples.

    :param words: Iterable of items to look up.
    :param vocab: Vocabulary the items are looked up in.
    :return: A tuple with one looked-up result per element of `words`.
    """
    resolved = [_dispatched_lookup(entry, vocab) for entry in words]
    return tuple(resolved)


@_dispatched_lookup.register(str)
def _string_lookup(word, vocab):
    """Look up a single word in the vocabulary.

    :param str word: The word to look up.
    :param vocab: Vocabulary supporting ``in`` and exposing `unk_label`.
    :return: `word` itself if it is in `vocab`, otherwise `vocab.unk_label`.
    """
    if word in vocab:
        return word
    return vocab.unk_label


class Vocabulary:
    """Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:
    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return a tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    In general, the interface is the same as that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    """

    def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
        """Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. An iterable is counted into a
                       fresh counter. A `Counter` instance is stored as-is
                       (not copied), so updates made through either reference
                       are visible through the other.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.
        :raises ValueError: If `unk_cutoff` is less than 1.

        """
        # Validate first so that an invalid cutoff does not consume a one-shot
        # iterable passed as `counts`.
        if unk_cutoff < 1:
            raise ValueError(
                "Cutoff value cannot be less than 1. Got: {0}".format(unk_cutoff)
            )
        if isinstance(counts, Counter):
            self.counts = counts
        else:
            self.counts = Counter()
            if isinstance(counts, Iterable):
                self.counts.update(counts)
        self.unk_label = unk_label
        self._cutoff = unk_cutoff

    @property
    def cutoff(self):
        """Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        """
        return self._cutoff

    def update(self, *counter_args, **counter_kwargs):
        """Update vocabulary counts.

        Wraps `collections.Counter.update` method; all arguments are passed
        through unchanged.

        """
        self.counts.update(*counter_args, **counter_kwargs)

    def lookup(self, words):
        """Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed an iterable of words, will try to
        look each of them up and return a tuple of the looked up words. Nested
        iterables yield nested tuples.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: tuple(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        """
        return _dispatched_lookup(words, self)

    def __getitem__(self, item):
        """Return the count recorded for `item`.

        The unknown label always reports the cutoff value, which keeps it a
        member of the vocabulary regardless of what `counts` holds for it.
        """
        return self._cutoff if item == self.unk_label else self.counts[item]

    def __contains__(self, item):
        """Only consider items with counts GE to cutoff as being in the
        vocabulary."""
        return self[item] >= self.cutoff

    def __iter__(self):
        """Building on membership check define how to iterate over
        vocabulary.

        Yields every counted item that passes the cutoff, followed by the
        unknown label when the counter is non-empty. The unknown label is
        yielded exactly once, even if it also occurs as a key in `counts`.
        """
        unk = self.unk_label
        return chain(
            # Skip the unknown label here: it always passes the membership
            # check and is appended below, so including it would emit it twice
            # and inflate `len()`.
            (item for item in self.counts if item != unk and item in self),
            [unk] if self.counts else [],
        )

    def __len__(self):
        """Computing size of vocabulary reflects the cutoff."""
        return sum(1 for _ in self)

    def __eq__(self, other):
        """Compare unknown label, cutoff and counts with another Vocabulary.

        Returns `NotImplemented` for operands that are not `Vocabulary`
        instances so that ``vocab == 1`` evaluates to ``False`` instead of
        raising `AttributeError`.
        """
        if not isinstance(other, Vocabulary):
            return NotImplemented
        return (
            self.unk_label == other.unk_label
            and self.cutoff == other.cutoff
            and self.counts == other.counts
        )

    def __str__(self):
        """Return a short summary: class name, cutoff, unknown label, size."""
        return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(
            self.__class__.__name__, self.cutoff, self.unk_label, len(self)
        )
readme check 5 years ago			`# Natural Language Toolkit`
			`#`
add tag_comparison_v3.py 5 years ago			`# Copyright (C) 2001-2020 NLTK Project`
readme check 5 years ago			`# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`
			`"""Language Model Vocabulary"""`

			`import sys`
add tag_comparison_v3.py 5 years ago			`from collections import Counter`
			`from collections.abc import Iterable`
readme check 5 years ago			`from itertools import chain`
add tag_comparison_v3.py 5 years ago			`from functools import singledispatch`
readme check 5 years ago

			`@singledispatch`
			`def _dispatched_lookup(words, vocab):`
			`raise TypeError(`
			`"Unsupported type for looking up in vocabulary: {0}".format(type(words))`
			`)`


			`@_dispatched_lookup.register(Iterable)`
			`def _(words, vocab):`
			`"""Look up a sequence of words in the vocabulary.`

			`Returns an iterator over looked up words.`

			`"""`
			`return tuple(_dispatched_lookup(w, vocab) for w in words)`


add tag_comparison_v3.py 5 years ago			`@_dispatched_lookup.register(str)`
readme check 5 years ago			`def _string_lookup(word, vocab):`
			`"""Looks up one word in the vocabulary."""`
			`return word if word in vocab else vocab.unk_label`


add tag_comparison_v3.py 5 years ago			`class Vocabulary:`
readme check 5 years ago			`"""Stores language model vocabulary.`

			`Satisfies two common language modeling requirements for a vocabulary:`
			`- When checking membership and calculating its size, filters items`
			`by comparing their counts to a cutoff value.`
			`- Adds a special "unknown" token which unseen words are mapped to.`

			`>>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']`
			`>>> from nltk.lm import Vocabulary`
			`>>> vocab = Vocabulary(words, unk_cutoff=2)`

			`Tokens with counts greater than or equal to the cutoff value will`
			`be considered part of the vocabulary.`

			`>>> vocab['c']`
			`3`
			`>>> 'c' in vocab`
			`True`
			`>>> vocab['d']`
			`2`
			`>>> 'd' in vocab`
			`True`

			`Tokens with frequency counts less than the cutoff value will be considered not`
			`part of the vocabulary even though their entries in the count dictionary are`
			`preserved.`

			`>>> vocab['b']`
			`1`
			`>>> 'b' in vocab`
			`False`
			`>>> vocab['aliens']`
			`0`
			`>>> 'aliens' in vocab`
			`False`

			`Keeping the count entries for seen words allows us to change the cutoff value`
			`without having to recalculate the counts.`

			`>>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)`
			`>>> "b" in vocab2`
			`True`

			`The cutoff value influences not only membership checking but also the result of`
			getting the size of the vocabulary using the built-in `len`.
			`Note that while the number of keys in the vocabulary's counter stays the same,`
			`the items in the vocabulary differ depending on the cutoff.`
			We use `sorted` to demonstrate because it keeps the order consistent.

			`>>> sorted(vocab2.counts)`
			`['-', 'a', 'b', 'c', 'd', 'r']`
			`>>> sorted(vocab2)`
			`['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']`
			`>>> sorted(vocab.counts)`
			`['-', 'a', 'b', 'c', 'd', 'r']`
			`>>> sorted(vocab)`
			`['<UNK>', 'a', 'c', 'd']`

			`In addition to items it gets populated with, the vocabulary stores a special`
			`token that stands in for so-called "unknown" items. By default it's "<UNK>".`

			`>>> "<UNK>" in vocab`
			`True`

			We can look up words in a vocabulary using its `lookup` method.
			`"Unseen" words (with counts less than cutoff) are looked up as the unknown label.`
			`If given one word (a string) as an input, this method will return a string.`

			`>>> vocab.lookup("a")`
			`'a'`
			`>>> vocab.lookup("aliens")`
			`'<UNK>'`

			`If given a sequence, it will return a tuple of the looked up words.`

			`>>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])`
			`('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')`

			`It's possible to update the counts after the vocabulary has been created.`
add tag_comparison_v3.py 5 years ago			In general, the interface is the same as that of `collections.Counter`.
readme check 5 years ago
			`>>> vocab['b']`
			`1`
			`>>> vocab.update(["b", "b", "c"])`
			`>>> vocab['b']`
			`3`
			`"""`

			`def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):`
			`"""Create a new Vocabulary.`

			:param counts: Optional iterable or `collections.Counter` instance to
			`pre-seed the Vocabulary. In case it is iterable, counts`
			`are calculated.`
			`:param int unk_cutoff: Words that occur less frequently than this value`
			`are not considered part of the vocabulary.`
			`:param unk_label: Label for marking words not part of vocabulary.`

			`"""`
			`if isinstance(counts, Counter):`
			`self.counts = counts`
			`else:`
			`self.counts = Counter()`
			`if isinstance(counts, Iterable):`
			`self.counts.update(counts)`
			`self.unk_label = unk_label`
			`if unk_cutoff < 1:`
			`raise ValueError(`
			`"Cutoff value cannot be less than 1. Got: {0}".format(unk_cutoff)`
			`)`
			`self._cutoff = unk_cutoff`

			`@property`
			`def cutoff(self):`
			`"""Cutoff value.`

			`Items with count below this value are not considered part of vocabulary.`

			`"""`
			`return self._cutoff`

			`def update(self, *counter_args, **counter_kwargs):`
			`"""Update vocabulary counts.`

			Wraps `collections.Counter.update` method.

			`"""`
			`self.counts.update(*counter_args, **counter_kwargs)`

			`def lookup(self, words):`
			`"""Look up one or more words in the vocabulary.`

			If passed one word as a string will return that word or `self.unk_label`.
			`Otherwise will assume it was passed a sequence of words, will try to look`
			`each of them up and return an iterator over the looked up words.`

			`:param words: Word(s) to look up.`
			`:type words: Iterable(str) or str`
			`:rtype: generator(str) or str`
			`:raises: TypeError for types other than strings or iterables`

			`>>> from nltk.lm import Vocabulary`
			`>>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)`
			`>>> vocab.lookup("a")`
			`'a'`
			`>>> vocab.lookup("aliens")`
			`'<UNK>'`
			`>>> vocab.lookup(["a", "b", "c", ["x", "b"]])`
			`('a', 'b', '<UNK>', ('<UNK>', 'b'))`

			`"""`
			`return _dispatched_lookup(words, self)`

			`def __getitem__(self, item):`
			`return self._cutoff if item == self.unk_label else self.counts[item]`

			`def __contains__(self, item):`
			`"""Only consider items with counts GE to cutoff as being in the`
			`vocabulary."""`
			`return self[item] >= self.cutoff`

			`def __iter__(self):`
			`"""Building on membership check define how to iterate over`
			`vocabulary."""`
			`return chain(`
			`(item for item in self.counts if item in self),`
			`[self.unk_label] if self.counts else [],`
			`)`

			`def __len__(self):`
			`"""Computing size of vocabulary reflects the cutoff."""`
			`return sum(1 for _ in self)`

			`def __eq__(self, other):`
			`return (`
			`self.unk_label == other.unk_label`
			`and self.cutoff == other.cutoff`
			`and self.counts == other.counts`
			`)`

			`def __str__(self):`
			`return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(`
			`self.__class__.__name__, self.cutoff, self.unk_label, len(self)`
			`)`