You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
2.8 KiB
Python

# Natural Language Toolkit: Minimal Sets
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
from collections import defaultdict
class MinimalSet(object):
    """
    Find contexts where more than one possible target value can
    appear.  E.g. if targets are word-initial letters, and contexts
    are the remainders of words, then we would like to find cases like
    "fat" vs "cat", and "training" vs "draining".  If targets are
    parts-of-speech and contexts are words, then we would like to find
    cases like wind (noun) 'air in rapid motion', vs wind (verb)
    'coil, wrap'.
    """

    def __init__(self, parameters=None):
        """
        Create a new minimal set.

        :param parameters: The (context, target, display) tuples for the item
        :type parameters: list(tuple(str, str, str))
        """
        self._targets = set()  # the contrastive information
        self._contexts = set()  # what we are controlling for
        self._seen = defaultdict(set)  # context -> targets seen in that context
        self._displays = {}  # (context, target) -> what we will display

        # A missing or empty ``parameters`` leaves the minimal set empty.
        if parameters:
            for context, target, display in parameters:
                self.add(context, target, display)

    def add(self, context, target, display):
        """
        Add a new item to the minimal set, having the specified
        context, target, and display form.

        Adding the same (context, target) pair again overwrites the
        display form stored for that pair.

        :param context: The context in which the item of interest appears
        :type context: str
        :param target: The item of interest
        :type target: str
        :param display: The information to be reported for each item
        :type display: str
        """
        # Store the set of targets that occurred in this context
        self._seen[context].add(target)

        # Keep track of which contexts and targets we have seen
        self._contexts.add(context)
        self._targets.add(target)

        # For a given context and target, store the display form
        self._displays[(context, target)] = display

    def contexts(self, minimum=2):
        """
        Determine which contexts occurred with enough distinct targets.

        :param minimum: the minimum number of distinct target forms
        :type minimum: int
        :return: the contexts seen with at least ``minimum`` distinct
            targets, in the iteration order of the internal set
        :rtype: list
        """
        # Every member of ``_contexts`` was also inserted into ``_seen`` by
        # ``add``, so indexing the defaultdict here never creates new keys.
        return [c for c in self._contexts if len(self._seen[c]) >= minimum]

    def display(self, context, target, default=""):
        """
        Look up the display form stored for a (context, target) pair.

        :param context: The context of the item
        :param target: The target of the item
        :param default: The value returned when the pair was never added
        :return: the stored display form, or ``default`` if there is none
        """
        return self._displays.get((context, target), default)

    def display_all(self, context):
        """
        Collect the display forms of every known target in one context.

        Targets that were never added for ``context`` are skipped, as are
        stored display forms that are falsy (e.g. the empty string).  The
        result follows the iteration order of the internal target set.

        :param context: The context whose display forms are wanted
        :return: the non-empty display forms found for ``context``
        :rtype: list
        """
        forms = (self.display(context, target) for target in self._targets)
        return [form for form in forms if form]

    def targets(self):
        """
        Return the targets added so far.

        The returned object is the internal set itself, not a copy, so
        mutating it changes the state of this minimal set.

        :rtype: set
        """
        return self._targets