You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
394 lines
13 KiB
Python
394 lines
13 KiB
Python
6 years ago
|
from __future__ import division, print_function, absolute_import
|
||
|
|
||
|
from scipy import stats
|
||
|
import numpy as np
|
||
|
from numpy.testing import (assert_almost_equal, assert_,
|
||
|
assert_array_almost_equal, assert_array_almost_equal_nulp, assert_allclose)
|
||
|
import pytest
|
||
|
from pytest import raises as assert_raises
|
||
|
|
||
|
|
||
|
def test_kde_1d():
|
||
|
#some basic tests comparing to normal distribution
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 500
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
xnmean = xn.mean()
|
||
|
xnstd = xn.std(ddof=1)
|
||
|
|
||
|
# get kde for original sample
|
||
|
gkde = stats.gaussian_kde(xn)
|
||
|
|
||
|
# evaluate the density function for the kde for some points
|
||
|
xs = np.linspace(-7,7,501)
|
||
|
kdepdf = gkde.evaluate(xs)
|
||
|
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
|
||
|
intervall = xs[1] - xs[0]
|
||
|
|
||
|
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
|
||
|
prob1 = gkde.integrate_box_1d(xnmean, np.inf)
|
||
|
prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
|
||
|
assert_almost_equal(prob1, 0.5, decimal=1)
|
||
|
assert_almost_equal(prob2, 0.5, decimal=1)
|
||
|
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
|
||
|
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
|
||
|
|
||
|
assert_almost_equal(gkde.integrate_kde(gkde),
|
||
|
(kdepdf**2).sum()*intervall, decimal=2)
|
||
|
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
|
||
|
(kdepdf*normpdf).sum()*intervall, decimal=2)
|
||
|
|
||
|
|
||
|
def test_kde_1d_weighted():
|
||
|
#some basic tests comparing to normal distribution
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 500
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
wn = np.random.rand(n_basesample)
|
||
|
xnmean = np.average(xn, weights=wn)
|
||
|
xnstd = np.sqrt(np.average((xn-xnmean)**2, weights=wn))
|
||
|
|
||
|
# get kde for original sample
|
||
|
gkde = stats.gaussian_kde(xn, weights=wn)
|
||
|
|
||
|
# evaluate the density function for the kde for some points
|
||
|
xs = np.linspace(-7,7,501)
|
||
|
kdepdf = gkde.evaluate(xs)
|
||
|
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
|
||
|
intervall = xs[1] - xs[0]
|
||
|
|
||
|
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
|
||
|
prob1 = gkde.integrate_box_1d(xnmean, np.inf)
|
||
|
prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
|
||
|
assert_almost_equal(prob1, 0.5, decimal=1)
|
||
|
assert_almost_equal(prob2, 0.5, decimal=1)
|
||
|
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
|
||
|
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)
|
||
|
|
||
|
assert_almost_equal(gkde.integrate_kde(gkde),
|
||
|
(kdepdf**2).sum()*intervall, decimal=2)
|
||
|
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
|
||
|
(kdepdf*normpdf).sum()*intervall, decimal=2)
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
|
||
|
def test_kde_2d():
|
||
|
#some basic tests comparing to normal distribution
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 500
|
||
|
|
||
|
mean = np.array([1.0, 3.0])
|
||
|
covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
|
||
|
|
||
|
# Need transpose (shape (2, 500)) for kde
|
||
|
xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
|
||
|
|
||
|
# get kde for original sample
|
||
|
gkde = stats.gaussian_kde(xn)
|
||
|
|
||
|
# evaluate the density function for the kde for some points
|
||
|
x, y = np.mgrid[-7:7:500j, -7:7:500j]
|
||
|
grid_coords = np.vstack([x.ravel(), y.ravel()])
|
||
|
kdepdf = gkde.evaluate(grid_coords)
|
||
|
kdepdf = kdepdf.reshape(500, 500)
|
||
|
|
||
|
normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]), mean=mean, cov=covariance)
|
||
|
intervall = y.ravel()[1] - y.ravel()[0]
|
||
|
|
||
|
assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
|
||
|
|
||
|
small = -1e100
|
||
|
large = 1e100
|
||
|
prob1 = gkde.integrate_box([small, mean[1]], [large, large])
|
||
|
prob2 = gkde.integrate_box([small, small], [large, mean[1]])
|
||
|
|
||
|
assert_almost_equal(prob1, 0.5, decimal=1)
|
||
|
assert_almost_equal(prob2, 0.5, decimal=1)
|
||
|
assert_almost_equal(gkde.integrate_kde(gkde),
|
||
|
(kdepdf**2).sum()*(intervall**2), decimal=2)
|
||
|
assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
|
||
|
(kdepdf*normpdf).sum()*(intervall**2), decimal=2)
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
|
||
|
def test_kde_2d_weighted():
|
||
|
#some basic tests comparing to normal distribution
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 500
|
||
|
|
||
|
mean = np.array([1.0, 3.0])
|
||
|
covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
|
||
|
|
||
|
# Need transpose (shape (2, 500)) for kde
|
||
|
xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
|
||
|
wn = np.random.rand(n_basesample)
|
||
|
|
||
|
# get kde for original sample
|
||
|
gkde = stats.gaussian_kde(xn, weights=wn)
|
||
|
|
||
|
# evaluate the density function for the kde for some points
|
||
|
x, y = np.mgrid[-7:7:500j, -7:7:500j]
|
||
|
grid_coords = np.vstack([x.ravel(), y.ravel()])
|
||
|
kdepdf = gkde.evaluate(grid_coords)
|
||
|
kdepdf = kdepdf.reshape(500, 500)
|
||
|
|
||
|
normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]), mean=mean, cov=covariance)
|
||
|
intervall = y.ravel()[1] - y.ravel()[0]
|
||
|
|
||
|
assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)
|
||
|
|
||
|
small = -1e100
|
||
|
large = 1e100
|
||
|
prob1 = gkde.integrate_box([small, mean[1]], [large, large])
|
||
|
prob2 = gkde.integrate_box([small, small], [large, mean[1]])
|
||
|
|
||
|
assert_almost_equal(prob1, 0.5, decimal=1)
|
||
|
assert_almost_equal(prob2, 0.5, decimal=1)
|
||
|
assert_almost_equal(gkde.integrate_kde(gkde),
|
||
|
(kdepdf**2).sum()*(intervall**2), decimal=2)
|
||
|
assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
|
||
|
(kdepdf*normpdf).sum()*(intervall**2), decimal=2)
|
||
|
|
||
|
|
||
|
def test_kde_bandwidth_method():
|
||
|
def scotts_factor(kde_obj):
|
||
|
"""Same as default, just check that it works."""
|
||
|
return np.power(kde_obj.n, -1./(kde_obj.d+4))
|
||
|
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 50
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
|
||
|
# Default
|
||
|
gkde = stats.gaussian_kde(xn)
|
||
|
# Supply a callable
|
||
|
gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
|
||
|
# Supply a scalar
|
||
|
gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
|
||
|
|
||
|
xs = np.linspace(-7,7,51)
|
||
|
kdepdf = gkde.evaluate(xs)
|
||
|
kdepdf2 = gkde2.evaluate(xs)
|
||
|
assert_almost_equal(kdepdf, kdepdf2)
|
||
|
kdepdf3 = gkde3.evaluate(xs)
|
||
|
assert_almost_equal(kdepdf, kdepdf3)
|
||
|
|
||
|
assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
|
||
|
|
||
|
|
||
|
def test_kde_bandwidth_method_weighted():
|
||
|
def scotts_factor(kde_obj):
|
||
|
"""Same as default, just check that it works."""
|
||
|
return np.power(kde_obj.neff, -1./(kde_obj.d+4))
|
||
|
|
||
|
np.random.seed(8765678)
|
||
|
n_basesample = 50
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
|
||
|
# Default
|
||
|
gkde = stats.gaussian_kde(xn)
|
||
|
# Supply a callable
|
||
|
gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
|
||
|
# Supply a scalar
|
||
|
gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)
|
||
|
|
||
|
xs = np.linspace(-7,7,51)
|
||
|
kdepdf = gkde.evaluate(xs)
|
||
|
kdepdf2 = gkde2.evaluate(xs)
|
||
|
assert_almost_equal(kdepdf, kdepdf2)
|
||
|
kdepdf3 = gkde3.evaluate(xs)
|
||
|
assert_almost_equal(kdepdf, kdepdf3)
|
||
|
|
||
|
assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
|
||
|
|
||
|
|
||
|
# Subclasses that should stay working (extracted from various sources).
|
||
|
# Unfortunately the earlier design of gaussian_kde made it necessary for users
|
||
|
# to create these kinds of subclasses, or call _compute_covariance() directly.
|
||
|
|
||
|
class _kde_subclass1(stats.gaussian_kde):
|
||
|
def __init__(self, dataset):
|
||
|
self.dataset = np.atleast_2d(dataset)
|
||
|
self.d, self.n = self.dataset.shape
|
||
|
self.covariance_factor = self.scotts_factor
|
||
|
self._compute_covariance()
|
||
|
|
||
|
|
||
|
class _kde_subclass2(stats.gaussian_kde):
|
||
|
def __init__(self, dataset):
|
||
|
self.covariance_factor = self.scotts_factor
|
||
|
super(_kde_subclass2, self).__init__(dataset)
|
||
|
|
||
|
|
||
|
class _kde_subclass3(stats.gaussian_kde):
|
||
|
def __init__(self, dataset, covariance):
|
||
|
self.covariance = covariance
|
||
|
stats.gaussian_kde.__init__(self, dataset)
|
||
|
|
||
|
def _compute_covariance(self):
|
||
|
self.inv_cov = np.linalg.inv(self.covariance)
|
||
|
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \
|
||
|
* self.n
|
||
|
|
||
|
|
||
|
class _kde_subclass4(stats.gaussian_kde):
|
||
|
def covariance_factor(self):
|
||
|
return 0.5 * self.silverman_factor()
|
||
|
|
||
|
|
||
|
def test_gaussian_kde_subclassing():
|
||
|
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
|
||
|
xs = np.linspace(-10, 10, num=50)
|
||
|
|
||
|
# gaussian_kde itself
|
||
|
kde = stats.gaussian_kde(x1)
|
||
|
ys = kde(xs)
|
||
|
|
||
|
# subclass 1
|
||
|
kde1 = _kde_subclass1(x1)
|
||
|
y1 = kde1(xs)
|
||
|
assert_array_almost_equal_nulp(ys, y1, nulp=10)
|
||
|
|
||
|
# subclass 2
|
||
|
kde2 = _kde_subclass2(x1)
|
||
|
y2 = kde2(xs)
|
||
|
assert_array_almost_equal_nulp(ys, y2, nulp=10)
|
||
|
|
||
|
# subclass 3
|
||
|
kde3 = _kde_subclass3(x1, kde.covariance)
|
||
|
y3 = kde3(xs)
|
||
|
assert_array_almost_equal_nulp(ys, y3, nulp=10)
|
||
|
|
||
|
# subclass 4
|
||
|
kde4 = _kde_subclass4(x1)
|
||
|
y4 = kde4(x1)
|
||
|
y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
|
||
|
|
||
|
assert_array_almost_equal(y_expected, y4, decimal=6)
|
||
|
|
||
|
# Not a subclass, but check for use of _compute_covariance()
|
||
|
kde5 = kde
|
||
|
kde5.covariance_factor = lambda: kde.factor
|
||
|
kde5._compute_covariance()
|
||
|
y5 = kde5(xs)
|
||
|
assert_array_almost_equal_nulp(ys, y5, nulp=10)
|
||
|
|
||
|
|
||
|
def test_gaussian_kde_covariance_caching():
|
||
|
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
|
||
|
xs = np.linspace(-10, 10, num=5)
|
||
|
# These expected values are from scipy 0.10, before some changes to
|
||
|
# gaussian_kde. They were not compared with any external reference.
|
||
|
y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]
|
||
|
|
||
|
# Set the bandwidth, then reset it to the default.
|
||
|
kde = stats.gaussian_kde(x1)
|
||
|
kde.set_bandwidth(bw_method=0.5)
|
||
|
kde.set_bandwidth(bw_method='scott')
|
||
|
y2 = kde(xs)
|
||
|
|
||
|
assert_array_almost_equal(y_expected, y2, decimal=7)
|
||
|
|
||
|
|
||
|
def test_gaussian_kde_monkeypatch():
|
||
|
"""Ugly, but people may rely on this. See scipy pull request 123,
|
||
|
specifically the linked ML thread "Width of the Gaussian in stats.kde".
|
||
|
If it is necessary to break this later on, that is to be discussed on ML.
|
||
|
"""
|
||
|
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
|
||
|
xs = np.linspace(-10, 10, num=50)
|
||
|
|
||
|
# The old monkeypatched version to get at Silverman's Rule.
|
||
|
kde = stats.gaussian_kde(x1)
|
||
|
kde.covariance_factor = kde.silverman_factor
|
||
|
kde._compute_covariance()
|
||
|
y1 = kde(xs)
|
||
|
|
||
|
# The new saner version.
|
||
|
kde2 = stats.gaussian_kde(x1, bw_method='silverman')
|
||
|
y2 = kde2(xs)
|
||
|
|
||
|
assert_array_almost_equal_nulp(y1, y2, nulp=10)
|
||
|
|
||
|
|
||
|
def test_kde_integer_input():
|
||
|
"""Regression test for #1181."""
|
||
|
x1 = np.arange(5)
|
||
|
kde = stats.gaussian_kde(x1)
|
||
|
y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
|
||
|
assert_array_almost_equal(kde(x1), y_expected, decimal=6)
|
||
|
|
||
|
|
||
|
def test_pdf_logpdf():
|
||
|
np.random.seed(1)
|
||
|
n_basesample = 50
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
|
||
|
# Default
|
||
|
gkde = stats.gaussian_kde(xn)
|
||
|
|
||
|
xs = np.linspace(-15, 12, 25)
|
||
|
pdf = gkde.evaluate(xs)
|
||
|
pdf2 = gkde.pdf(xs)
|
||
|
assert_almost_equal(pdf, pdf2, decimal=12)
|
||
|
|
||
|
logpdf = np.log(pdf)
|
||
|
logpdf2 = gkde.logpdf(xs)
|
||
|
assert_almost_equal(logpdf, logpdf2, decimal=12)
|
||
|
|
||
|
# There are more points than data
|
||
|
gkde = stats.gaussian_kde(xs)
|
||
|
pdf = np.log(gkde.evaluate(xn))
|
||
|
pdf2 = gkde.logpdf(xn)
|
||
|
assert_almost_equal(pdf, pdf2, decimal=12)
|
||
|
|
||
|
|
||
|
def test_pdf_logpdf_weighted():
|
||
|
np.random.seed(1)
|
||
|
n_basesample = 50
|
||
|
xn = np.random.randn(n_basesample)
|
||
|
wn = np.random.rand(n_basesample)
|
||
|
|
||
|
# Default
|
||
|
gkde = stats.gaussian_kde(xn, weights=wn)
|
||
|
|
||
|
xs = np.linspace(-15, 12, 25)
|
||
|
pdf = gkde.evaluate(xs)
|
||
|
pdf2 = gkde.pdf(xs)
|
||
|
assert_almost_equal(pdf, pdf2, decimal=12)
|
||
|
|
||
|
logpdf = np.log(pdf)
|
||
|
logpdf2 = gkde.logpdf(xs)
|
||
|
assert_almost_equal(logpdf, logpdf2, decimal=12)
|
||
|
|
||
|
# There are more points than data
|
||
|
gkde = stats.gaussian_kde(xs)
|
||
|
pdf = np.log(gkde.evaluate(xn))
|
||
|
pdf2 = gkde.logpdf(xn)
|
||
|
assert_almost_equal(pdf, pdf2, decimal=12)
|
||
|
|
||
|
|
||
|
def test_weights_intact():
|
||
|
# regression test for gh-9709: weights are not modified
|
||
|
np.random.seed(12345)
|
||
|
vals = np.random.lognormal(size=100)
|
||
|
weights = np.random.choice([1.0, 10.0, 100], size=vals.size)
|
||
|
orig_weights = weights.copy()
|
||
|
|
||
|
stats.gaussian_kde(np.log10(vals), weights=weights)
|
||
|
assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)
|
||
|
|
||
|
|
||
|
def test_weights_integer():
|
||
|
# integer weights are OK, cf gh-9709 (comment)
|
||
|
np.random.seed(12345)
|
||
|
values = [0.2, 13.5, 21.0, 75.0, 99.0]
|
||
|
weights = [1, 2, 4, 8, 16] # a list of integers
|
||
|
pdf_i = stats.gaussian_kde(values, weights=weights)
|
||
|
pdf_f = stats.gaussian_kde(values, weights=np.float64(weights))
|
||
|
|
||
|
xn = [0.3, 11, 88]
|
||
|
assert_allclose(pdf_i.evaluate(xn),
|
||
|
pdf_f.evaluate(xn), atol=1e-14, rtol=1e-14)
|
||
|
|