You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
182 lines
7.1 KiB
Python
182 lines
7.1 KiB
Python
5 years ago
|
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
||
|
#
|
||
|
# Copyright (C) 2012 NLTK Project
|
||
|
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
A variant of the Naive Bayes Classifier that performs binary classification with
|
||
|
partially-labeled training sets. In other words, assume we want to build a classifier
|
||
|
that assigns each example to one of two complementary classes (e.g., male names and
|
||
|
female names).
|
||
|
If we have a training set with labeled examples for both classes, we can use a
|
||
|
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
||
|
examples for one of the classes, and other, unlabeled, examples.
|
||
|
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
||
|
to estimate the frequencies of the various features.
|
||
|
|
||
|
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
||
|
and unlabeled examples. We are also given an estimate of P(1).
|
||
|
|
||
|
We compute P(feature|1) exactly as in the standard case.
|
||
|
|
||
|
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
||
|
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
||
|
and then express the conditional probability as:
|
||
|
|
||
|
| P(feature) - P(feature|1) * P(1)
|
||
|
| P(feature|0) = ----------------------------------
|
||
|
| P(0)
|
||
|
|
||
|
Example:
|
||
|
|
||
|
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
||
|
|
||
|
Some sentences about sports:
|
||
|
|
||
|
>>> sports_sentences = [ 'The team dominated the game',
|
||
|
... 'They lost the ball',
|
||
|
... 'The game was intense',
|
||
|
... 'The goalkeeper catched the ball',
|
||
|
... 'The other team controlled the ball' ]
|
||
|
|
||
|
Mixed topics, including sports:
|
||
|
|
||
|
>>> various_sentences = [ 'The President did not comment',
|
||
|
... 'I lost the keys',
|
||
|
... 'The team won the game',
|
||
|
... 'Sara has two kids',
|
||
|
... 'The ball went off the court',
|
||
|
... 'They had the ball for the whole game',
|
||
|
... 'The show is over' ]
|
||
|
|
||
|
The features of a sentence are simply the words it contains:
|
||
|
|
||
|
>>> def features(sentence):
|
||
|
... words = sentence.lower().split()
|
||
|
... return dict(('contains(%s)' % w, True) for w in words)
|
||
|
|
||
|
We use the sports sentences as positive examples, the mixed ones ad unlabeled examples:
|
||
|
|
||
|
>>> positive_featuresets = map(features, sports_sentences)
|
||
|
>>> unlabeled_featuresets = map(features, various_sentences)
|
||
|
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
||
|
... unlabeled_featuresets)
|
||
|
|
||
|
Is the following sentence about sports?
|
||
|
|
||
|
>>> classifier.classify(features('The cat is on the table'))
|
||
|
False
|
||
|
|
||
|
What about this one?
|
||
|
|
||
|
>>> classifier.classify(features('My team lost the game'))
|
||
|
True
|
||
|
"""
|
||
|
|
||
|
from collections import defaultdict
|
||
|
|
||
|
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist
|
||
|
|
||
|
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||
|
|
||
|
##//////////////////////////////////////////////////////
|
||
|
## Positive Naive Bayes Classifier
|
||
|
##//////////////////////////////////////////////////////
|
||
|
|
||
|
|
||
|
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
|
||
|
@staticmethod
|
||
|
def train(
|
||
|
positive_featuresets,
|
||
|
unlabeled_featuresets,
|
||
|
positive_prob_prior=0.5,
|
||
|
estimator=ELEProbDist,
|
||
|
):
|
||
|
"""
|
||
|
:param positive_featuresets: An iterable of featuresets that are known as positive
|
||
|
examples (i.e., their label is ``True``).
|
||
|
|
||
|
:param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
|
||
|
|
||
|
:param positive_prob_prior: A prior estimate of the probability of the label
|
||
|
``True`` (default 0.5).
|
||
|
"""
|
||
|
positive_feature_freqdist = defaultdict(FreqDist)
|
||
|
unlabeled_feature_freqdist = defaultdict(FreqDist)
|
||
|
feature_values = defaultdict(set)
|
||
|
fnames = set()
|
||
|
|
||
|
# Count up how many times each feature value occurred in positive examples.
|
||
|
num_positive_examples = 0
|
||
|
for featureset in positive_featuresets:
|
||
|
for fname, fval in featureset.items():
|
||
|
positive_feature_freqdist[fname][fval] += 1
|
||
|
feature_values[fname].add(fval)
|
||
|
fnames.add(fname)
|
||
|
num_positive_examples += 1
|
||
|
|
||
|
# Count up how many times each feature value occurred in unlabeled examples.
|
||
|
num_unlabeled_examples = 0
|
||
|
for featureset in unlabeled_featuresets:
|
||
|
for fname, fval in featureset.items():
|
||
|
unlabeled_feature_freqdist[fname][fval] += 1
|
||
|
feature_values[fname].add(fval)
|
||
|
fnames.add(fname)
|
||
|
num_unlabeled_examples += 1
|
||
|
|
||
|
# If a feature didn't have a value given for an instance, then we assume that
|
||
|
# it gets the implicit value 'None'.
|
||
|
for fname in fnames:
|
||
|
count = positive_feature_freqdist[fname].N()
|
||
|
positive_feature_freqdist[fname][None] += num_positive_examples - count
|
||
|
feature_values[fname].add(None)
|
||
|
|
||
|
for fname in fnames:
|
||
|
count = unlabeled_feature_freqdist[fname].N()
|
||
|
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
|
||
|
feature_values[fname].add(None)
|
||
|
|
||
|
negative_prob_prior = 1.0 - positive_prob_prior
|
||
|
|
||
|
# Create the P(label) distribution.
|
||
|
label_probdist = DictionaryProbDist(
|
||
|
{True: positive_prob_prior, False: negative_prob_prior}
|
||
|
)
|
||
|
|
||
|
# Create the P(fval|label, fname) distribution.
|
||
|
feature_probdist = {}
|
||
|
for fname, freqdist in positive_feature_freqdist.items():
|
||
|
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||
|
feature_probdist[True, fname] = probdist
|
||
|
|
||
|
for fname, freqdist in unlabeled_feature_freqdist.items():
|
||
|
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||
|
negative_feature_probs = {}
|
||
|
for fval in feature_values[fname]:
|
||
|
prob = (
|
||
|
global_probdist.prob(fval)
|
||
|
- positive_prob_prior * feature_probdist[True, fname].prob(fval)
|
||
|
) / negative_prob_prior
|
||
|
# TODO: We need to add some kind of smoothing here, instead of
|
||
|
# setting negative probabilities to zero and normalizing.
|
||
|
negative_feature_probs[fval] = max(prob, 0.0)
|
||
|
feature_probdist[False, fname] = DictionaryProbDist(
|
||
|
negative_feature_probs, normalize=True
|
||
|
)
|
||
|
|
||
|
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
||
|
|
||
|
|
||
|
##//////////////////////////////////////////////////////
|
||
|
## Demo
|
||
|
##//////////////////////////////////////////////////////
|
||
|
|
||
|
|
||
|
def demo():
|
||
|
from nltk.classify.util import partial_names_demo
|
||
|
|
||
|
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
|
||
|
classifier.show_most_informative_features()
|