You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
345 lines
12 KiB
Python
345 lines
12 KiB
Python
# Natural Language Toolkit: Classifier Utility Functions
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
Utility functions and classes for classifiers.
|
|
"""
|
|
from __future__ import print_function, division
|
|
|
|
import math
|
|
|
|
# from nltk.util import Deprecated
|
|
import nltk.classify.util # for accuracy & log_likelihood
|
|
from nltk.util import LazyMap
|
|
|
|
######################################################################
|
|
# { Helper Functions
|
|
######################################################################
|
|
|
|
# alternative name possibility: 'map_featurefunc()'?
|
|
# alternative name possibility: 'detect_features()'?
|
|
# alternative name possibility: 'map_featuredetect()'?
|
|
# or.. just have users use LazyMap directly?
|
|
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    :return: A ``LazyMap`` over ``toks``.
    """
    if labeled is None:
        # Auto-detect: a non-empty ``toks`` whose first element is a tuple
        # or list is treated as labeled.  An empty ``toks`` is unlabeled.
        labeled = bool(toks) and isinstance(toks[0], (tuple, list))

    if labeled:

        def lazy_func(labeled_token):
            # Featurize only the token; the label is passed through as-is.
            return (feature_func(labeled_token[0]), labeled_token[1])

        return LazyMap(lazy_func, toks)

    return LazyMap(feature_func, toks)
|
|
|
|
|
|
def attested_labels(tokens):
    """
    Collect the distinct labels that occur in a list of classified tokens.

    :param tokens: The classified tokens from which to extract labels.
        Each element must unpack as a ``(token, label)`` pair, and each
        label must be hashable because the labels are de-duplicated
        through a set.
    :type tokens: list
    :return: Every distinct label, in no particular order.
    :rtype: tuple
    """
    distinct_labels = {label for (_, label) in tokens}
    return tuple(distinct_labels)
|
|
|
|
|
|
def log_likelihood(classifier, gold):
    """
    Return the natural log of the mean probability that ``classifier``
    assigns to the gold label of each item.

    :param classifier: An object providing ``prob_classify_many()``; each
        distribution it returns must provide ``prob(label)``.
    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
        iterated twice, so it must be re-iterable.
    :return: ``log(mean(P(label_i | featureset_i)))``.
    :raise ZeroDivisionError: If ``gold`` is empty.
    :raise ValueError: From ``math.log`` if the mean probability is zero.
    """
    featuresets = [featureset for (featureset, _) in gold]
    pdists = classifier.prob_classify_many(featuresets)
    gold_probs = [
        pdist.prob(label) for ((_, label), pdist) in zip(gold, pdists)
    ]
    mean_prob = sum(gold_probs) / len(gold_probs)
    return math.log(mean_prob)
|
|
|
|
|
|
def accuracy(classifier, gold):
    """
    Return the fraction of gold items that ``classifier`` labels correctly.

    :param classifier: An object providing ``classify_many()``.
    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
        iterated twice, so it must be re-iterable.
    :return: The number of predictions equal to their gold label divided
        by the number of items compared, or ``0`` when nothing was compared.
    """
    predictions = classifier.classify_many(
        [featureset for (featureset, _) in gold]
    )
    hits = [label == guess for ((_, label), guess) in zip(gold, predictions)]
    if not hits:
        return 0
    return sum(hits) / len(hits)
|
|
|
|
|
|
class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.

    The ``cutoffs`` dict may contain any of these keys (all optional):

    - ``max_iter``: stop once the iteration counter reaches this value.
    - ``min_ll``: stop once the log likelihood is at least ``-abs(min_ll)``.
    - ``min_lldelta``: stop once the log likelihood improves by no more
      than ``abs(min_lldelta)`` between two consecutive checks.
    - ``max_acc``: stop once the accuracy is at least this value.
    - ``min_accdelta``: stop once the accuracy improves by no more than
      ``abs(min_accdelta)`` between two consecutive checks.
    """

    def __init__(self, cutoffs):
        """
        :param cutoffs: A dict of cutoff settings (see the class docstring).
            It is copied; the caller's dict is left unmodified.
        """
        # Normalise the sign of the log-likelihood cutoffs on our own copy,
        # so the values check() reads are the normalised ones and the
        # caller's dict is never touched.
        self.cutoffs = cutoffs.copy()
        if 'min_ll' in self.cutoffs:
            self.cutoffs['min_ll'] = -abs(self.cutoffs['min_ll'])
        if 'min_lldelta' in self.cutoffs:
            self.cutoffs['min_lldelta'] = abs(self.cutoffs['min_lldelta'])
        self.ll = None  # log likelihood seen at the previous check, if any
        self.acc = None  # accuracy seen at the previous check, if any
        self.iter = 1

    def check(self, classifier, train_toks):
        """
        Advance the iteration counter and test every configured cutoff.

        :param classifier: The classifier to evaluate.
        :param train_toks: The ``(featureset, label)`` pairs on which log
            likelihood and accuracy are measured.
        :return: True if training should stop (a cutoff was reached or the
            log likelihood is NaN), False otherwise.
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True  # iteration cutoff.

        # The log likelihood is always computed: a NaN value stops training
        # even when no log-likelihood cutoff is configured.
        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True  # log likelihood cutoff
            if (
                'min_lldelta' in cutoffs
                # Compare against None explicitly: a previous value of 0.0
                # is a valid log likelihood and must not skip this test.
                and self.ll is not None
                and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True  # accuracy cutoff
            if (
                'min_accdelta' in cutoffs
                # As above: an accuracy of 0.0 is a real previous value.
                and self.acc is not None
                and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
|
|
|
|
|
|
######################################################################
|
|
# { Demos
|
|
######################################################################
|
|
|
|
|
|
def names_demo_features(name):
    """
    Build the demo featureset for a single name.

    The featureset holds a constant ``'alwayson'`` flag, the lower-cased
    first and last characters, and, for each letter a-z, how often that
    letter occurs in the lower-cased name and whether it occurs at all.

    :param name: A non-empty string (its first and last characters are
        indexed directly).
    :return: A dict mapping feature names to feature values.
    """
    lowered = name.lower()
    featureset = {
        'alwayson': True,
        'startswith': name[0].lower(),
        'endswith': name[-1].lower(),
    }
    for ch in 'abcdefghijklmnopqrstuvwxyz':
        featureset['count(%s)' % ch] = lowered.count(ch)
        featureset['has(%s)' % ch] = ch in lowered
    return featureset
|
|
|
|
|
|
def binary_names_demo_features(name):
    """
    Build a demo featureset for a name that also flags its boundary letters.

    The featureset holds a constant ``'alwayson'`` flag and whether the
    name starts or ends with one of ``'aeiouy'``.  For each letter a-z it
    also holds the letter's count in the lower-cased name, whether the
    letter occurs at all, and whether the name starts or ends with it.

    :param name: A non-empty string (its first and last characters are
        indexed directly).
    :return: A dict mapping feature names to feature values.
    """
    lowered = name.lower()
    first = name[0].lower()
    last = name[-1].lower()

    featureset = {
        'alwayson': True,
        'startswith(vowel)': first in 'aeiouy',
        'endswith(vowel)': last in 'aeiouy',
    }
    for ch in 'abcdefghijklmnopqrstuvwxyz':
        featureset['count(%s)' % ch] = lowered.count(ch)
        featureset['has(%s)' % ch] = ch in lowered
        featureset['startswith(%s)' % ch] = ch == first
        featureset['endswith(%s)' % ch] = ch == last
    return featureset
|
|
|
|
|
|
def names_demo(trainer, features=names_demo_features):
    """
    Train a classifier on the ``names`` corpus and print how well it does.

    The labeled names are shuffled with a fixed seed; the first 5000 are
    used for training and the next 500 for testing.

    :param trainer: A callable that takes a list of ``(featureset, label)``
        pairs and returns a classifier.
    :param features: A callable mapping a name to its featureset.
    :return: The trained classifier.
    """
    from nltk.corpus import names
    import random

    # Label every name in the corpus: male names first, then female names.
    labeled_names = [(name, 'male') for name in names.words('male.txt')]
    labeled_names += [(name, 'female') for name in names.words('female.txt')]

    # Fixed seed, so the train/test split is the same on every run.
    random.seed(123456)
    random.shuffle(labeled_names)
    train = labeled_names[:5000]
    test = labeled_names[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    train_set = [(features(name), gender) for (name, gender) in train]
    classifier = trainer(train_set)

    # Run the classifier on the test data.
    print('Testing classifier...')
    test_set = [(features(name), gender) for (name, gender) in test]
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    # Classifiers that cannot produce probabilities are expected to raise
    # NotImplementedError; the probability report is skipped for them.
    try:
        test_featuresets = [features(name) for (name, _) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((_, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        # Show the first five test names; '*' marks the gold gender.
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            fmt = (
                ' %-15s *%6.4f %6.4f'
                if gender == 'male'
                else ' %-15s %6.4f *%6.4f'
            )
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
|
|
|
|
|
|
def partial_names_demo(trainer, features=names_demo_features):
    """
    Train a classifier from positive and unlabeled examples drawn from the
    ``names`` corpus, then print how well it does.

    Male names are the positive class.  After a fixed-seed shuffle, 2000
    male names become the positive examples, a further 500 male plus 500
    female names become the unlabeled examples, and 250 male plus 250
    female names (labeled ``True`` / ``False``) form the test set.

    :param trainer: A callable invoked as ``trainer(positive, unlabeled)``,
        where both arguments are lists of featuresets; it returns a
        classifier whose labels are ``True`` and ``False``.
    :param features: A callable mapping a name to its featureset.
    :return: The trained classifier.
    """
    from nltk.corpus import names
    import random

    # Shuffle private copies, so the shuffle does not depend on (or alter)
    # whatever object the corpus reader hands back.
    male_names = list(names.words('male.txt'))
    female_names = list(names.words('female.txt'))

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training.  Real lists are built (not ``map`` objects) because on
    # Python 3 ``map`` returns a one-shot iterator.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [
        features(name) for name in male_names[2000:2500] + female_names[:500]
    ]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        # zip() is not subscriptable on Python 3, so materialise it before
        # slicing off the first five rows.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
|
|
|
|
|
|
# Per-word cache of (instance, first sense) pairs read from the senseval
# corpus, so repeated wsd_demo() calls for a word read the corpus only once.
_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    """
    Train a word-sense classifier for ``word`` on the ``senseval`` corpus
    and print how well it does.

    Each instance is labeled with its first listed sense.  The instances
    are shuffled with a fixed seed; of the first ``n``, the leading 80% are
    used for training and the remainder for testing.

    :param trainer: A callable that takes a list of ``(featureset, label)``
        pairs and returns a classifier.
    :param word: The senseval item whose instances are loaded.
    :param features: A callable mapping an instance to its featureset.
    :param n: The maximum number of instances to use; it is capped at the
        number of instances available.
    :return: The trained classifier.
    """
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print('Reading data...')
    if word not in _inst_cache:
        _inst_cache[word] = [
            (inst, inst.senses[0]) for inst in senseval.instances(word)
        ]
    # Work on a copy so the shuffle below leaves the cached list alone.
    instances = list(_inst_cache[word])
    n = min(n, len(instances))
    senses = list({sense for (_, sense) in instances})
    print(' Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    split_at = int(0.8 * n)
    train = instances[:split_at]
    test = instances[split_at:n]

    # Train up a classifier.
    print('Training classifier...')
    train_set = [(features(inst), sense) for (inst, sense) in train]
    classifier = trainer(train_set)

    # Run the classifier on the test data.
    print('Testing classifier...')
    test_set = [(features(inst), sense) for (inst, sense) in test]
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    # Classifiers that cannot produce probabilities are expected to raise
    # NotImplementedError; the log-likelihood report is skipped for them.
    try:
        test_featuresets = [features(inst) for (inst, _) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((_, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
|
|
|
|
|
|
def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.

    :raise NameError: If the name ``_megam_bin`` cannot be resolved from
        this module; the message tells the user how to configure megam.
    """
    # NOTE(review): ``_megam_bin`` is not assigned anywhere in the visible
    # source -- confirm where it is expected to be defined.
    try:
        _megam_bin
    except NameError:
        raise NameError(
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
|