# NOTE: page-extraction residue from the code-hosting web UI (not part of the module):
#   "You cannot select more than 25 topics"
#   "Topics must start with a letter or number, can include dashes ('-')
#    and can be up to 35 characters long."
#   File banner: 469 lines, 18 KiB, Python.
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
from __future__ import division
|
|
|
|
from builtins import str, bytes, dict, int
|
|
from builtins import map, zip, filter
|
|
from builtins import object, range
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
import unittest
|
|
import time
|
|
import math
|
|
|
|
from types import GeneratorType
|
|
|
|
from pattern import metrics
|
|
|
|
# Directory that contains this test module (symlinks resolved).
# NOTE(review): the fallback presumably covers environments where __file__ is
# not defined -- confirm. Only ordinary exceptions are caught, so that
# KeyboardInterrupt and SystemExit are no longer swallowed.
try:
    PATH = os.path.dirname(os.path.realpath(__file__))
except Exception:
    PATH = ""
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
class TestProfiling(unittest.TestCase):
    """Tests for the timing, classifier-evaluation and agreement functions of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        # Test set for accuracy, precision and recall:
        # each entry pairs a (dummy) document with its expected classification.
        self.documents = (
            (None, True),
            (None, True),
            (None, False)
        )

    def test_duration(self):
        # Assert 0.1 or slightly higher.
        # The comparison is inclusive so that an elapsed time of exactly 0.1 passes.
        v = metrics.duration(time.sleep, 0.1)
        self.assertGreaterEqual(v, 0.1)
        print("pattern.metrics.duration()")

    def test_confustion_matrix(self):
        # NOTE: the method name is misspelled ("confustion"); it is kept unchanged
        # so that selecting this test by name keeps working.
        # Assert 2 true positives (TP) and 1 false positive (FP).
        v = metrics.confusion_matrix(lambda document: True, self.documents)
        self.assertEqual(v, (2, 0, 1, 0))
        # Assert 1 true negative (TN) and 2 false negatives (FN).
        v = metrics.confusion_matrix(lambda document: False, self.documents)
        self.assertEqual(v, (0, 1, 0, 2))
        print("pattern.metrics.confusion_matrix()")

    def test_accuracy(self):
        # Assert 2.0/3.0 (two out of three correct predictions).
        v = metrics.accuracy(lambda document: True, self.documents)
        self.assertEqual(v, 2.0 / 3.0)
        print("pattern.metrics.accuracy()")

    def test_precision(self):
        # Assert 2.0/3.0 (2 TP, 1 FP).
        v = metrics.precision(lambda document: True, self.documents)
        self.assertEqual(v, 2.0 / 3.0)
        # Assert 0.0 (no TP).
        v = metrics.precision(lambda document: False, self.documents)
        self.assertEqual(v, 0.0)
        print("pattern.metrics.precision()")

    def test_recall(self):
        # Assert 1.0 (no FN).
        v = metrics.recall(lambda document: True, self.documents)
        self.assertEqual(v, 1.0)
        # Assert 0.0 (no TP).
        v = metrics.recall(lambda document: False, self.documents)
        self.assertEqual(v, 0.0)
        print("pattern.metrics.recall()")

    def test_F1(self):
        # Assert 0.8 (F1 for precision=2/3 and recall=1).
        # 0.8 is the result of floating-point arithmetic on 2/3, so it is
        # compared with a tolerance rather than for bit-exact equality.
        v = metrics.F1(lambda document: True, self.documents)
        self.assertAlmostEqual(v, 0.8)
        # F1 must agree with the general F-score at beta=1.
        self.assertEqual(v, metrics.F(lambda document: True, self.documents, beta=1))
        print("pattern.metrics.F1()")

    def test_agreement(self):
        # Assert 0.210 (example from http://en.wikipedia.org/wiki/Fleiss'_kappa).
        m = [[0, 0, 0, 0, 14],
             [0, 2, 6, 4, 2],
             [0, 0, 3, 5, 6],
             [0, 3, 9, 2, 0],
             [2, 2, 8, 1, 1],
             [7, 7, 0, 0, 0],
             [3, 2, 6, 3, 0],
             [2, 5, 3, 2, 2],
             [6, 5, 2, 1, 0],
             [0, 2, 2, 3, 7]]
        v = metrics.agreement(m)
        self.assertAlmostEqual(v, 0.210, places=3)
        print("pattern.metrics.agreement()")
|
|
|
|
|
|
class TestTextMetrics(unittest.TestCase):
    """Checks for the string-similarity, readability and text-statistics functions of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        pass

    def test_levenshtein(self):
        # Identical strings are at distance 0.
        distance = metrics.levenshtein("gallahad", "gallahad")
        self.assertEqual(distance, 0)
        # One insert, one delete and one replace add up to distance 3.
        distance = metrics.levenshtein("gallahad", "_g_llaha")
        self.assertEqual(distance, 3)
        print("pattern.metrics.levenshtein()")

    def test_levenshtein_similarity(self):
        # Identical strings score 1.0.
        score = metrics.levenshtein_similarity("gallahad", "gallahad")
        self.assertEqual(score, 1.0)
        # Two of the eight characters differ, hence 0.75.
        score = metrics.levenshtein_similarity("gallahad", "g_ll_had")
        self.assertEqual(score, 0.75)
        print("pattern.metrics.levenshtein_similarity()")

    def test_dice_coefficient(self):
        # Identical strings score 1.0.
        score = metrics.dice_coefficient("gallahad", "gallahad")
        self.assertEqual(score, 1.0)
        # 0.25 is the example from http://en.wikipedia.org/wiki/Dice_coefficient.
        score = metrics.dice_coefficient("night", "nacht")
        self.assertEqual(score, 0.25)
        print("pattern.metrics.dice_coefficient()")

    def test_similarity(self):
        # similarity() with a given metric constant must match the dedicated function.
        direct = metrics.levenshtein_similarity("night", "nacht")
        dispatched = metrics.similarity("night", "nacht", metrics.LEVENSHTEIN)
        self.assertEqual(direct, dispatched)
        direct = metrics.dice_coefficient("night", "nacht")
        dispatched = metrics.similarity("night", "nacht", metrics.DICE)
        self.assertEqual(direct, dispatched)
        print("pattern.metrics.similarity()")

    def test_readability(self):
        # Technical jargon must land in the "difficult" range (< 0.30).
        jargon = "The Australian platypus is seemingly a hybrid of a mammal and reptilian creature."
        self.assertLess(metrics.readability(jargon), 0.30)
        # Dr. Seuss must land in the "easy" range (> 0.70).
        seuss = (
            "'I know some good games we could play,' said the cat. "
            "'I know some new tricks,' said the cat in the hat. "
            "'A lot of good tricks. I will show them to you.' "
            "'Your mother will not mind at all if I do.'"
        )
        self.assertGreater(metrics.readability(seuss), 0.70)
        print("pattern.metrics.readability()")

    def test_intertextuality(self):
        # Evaluate precision and recall for plagiarism detection.
        from pattern.db import Datasheet
        rows = Datasheet.load(os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv"))
        # Each sample is ((text, source), label); the label is True when the
        # plagiarism column holds a positive integer.
        samples = [((txt, src), int(flag) > 0) for txt, src, flag in rows]

        def is_plagiarism(pair):
            # Classify as plagiarism when the 3-gram intertextuality of the
            # two documents exceeds 0.05.
            txt, src = pair
            return metrics.intertextuality([txt, src], n=3)[0, 1] > 0.05

        # metrics.test() yields four values here; only precision and recall are checked.
        _, precision, recall, _ = metrics.test(is_plagiarism, samples)
        self.assertGreater(precision, 0.96)
        self.assertGreater(recall, 0.94)
        print("pattern.metrics.intertextuality()")

    def test_ttr(self):
        # Type-token ratio: 7 words, 6 of them unique.
        text = "The black cat \n sat on the mat."
        ratio = metrics.ttr(text)
        self.assertAlmostEqual(ratio, 0.86, places=2)
        print("pattern.metrics.ttr()")

    def test_suffixes(self):
        # Assert base => inflected and reversed inflected => base suffixes.
        pairs = [("beau", "beaux"), ("jeune", "jeunes"), ("hautain", "hautaines")]
        expected = [
            (2, "nes", [("ne", 0.5), ("n", 0.5)]),
            (1, "aux", [("au", 1.0)]),
        ]
        self.assertEqual(metrics.suffixes(pairs, n=3), expected)
        expected = [
            (1, "ne", [("nes", 1.0)]),
            (1, "in", [("ines", 1.0)]),
            (1, "au", [("aux", 1.0)]),
        ]
        self.assertEqual(metrics.suffixes(pairs, n=2, reverse=False), expected)
        print("pattern.metrics.suffixes()")

    def test_isplit(self):
        # isplit() must behave like string.split(), but as a generator.
        parts = metrics.isplit("test\nisplit")
        self.assertTrue(isinstance(parts, GeneratorType))
        self.assertEqual(list(parts), ["test", "isplit"])
        print("pattern.metrics.isplit()")

    def test_cooccurrence(self):
        # Plain string input: collect the neighbours of "cat" in a (-1, 1) window.
        def is_cat(w):
            return w in ("cat",)

        def clean(w):
            return w.lower().strip(".:;,!?()[]'\"")

        text = "The black cat sat on the mat."
        found = metrics.cooccurrence(text, window=(-1, 1), term1=is_cat, normalize=clean)
        self.assertEqual(sorted(found.keys()), ["cat"])
        self.assertEqual(sorted(found["cat"].keys()), ["black", "cat", "sat"])
        self.assertEqual(sorted(found["cat"].values()), [1, 1, 1])

        # Tagged-token input: pair each noun with adjectives in the (-2, -1) window.
        def is_noun(token):
            return token[1].startswith("NN")

        def is_adjective(token):
            return token[1].startswith("JJ")

        tagged = [("The", "DT"), ("black", "JJ"), ("cat", "NN"), ("sat", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
        found = metrics.co_occurrence(tagged, window=(-2, -1), term1=is_noun, term2=is_adjective)
        self.assertEqual(found, {("cat", "NN"): {("black", "JJ"): 1}})
        print("pattern.metrics.cooccurrence()")
|
|
|
|
|
|
class TestInterpolation(unittest.TestCase):
    """Checks for the interpolation functions of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        pass

    def test_lerp(self):
        # Linear interpolation: halfway between 100 and 200 is 150.0.
        midpoint = metrics.lerp(100, 200, 0.5)
        self.assertEqual(midpoint, 150.0)
        print("pattern.metrics.lerp()")

    def test_smoothstep(self):
        # Cubic interpolation: exact at the midpoint, pushed outwards near the edges.
        middle = metrics.smoothstep(0.0, 1.0, 0.5)
        high = metrics.smoothstep(0.0, 1.0, 0.9)
        low = metrics.smoothstep(0.0, 1.0, 0.1)
        self.assertEqual(middle, 0.5)
        self.assertGreater(high, 0.9)
        self.assertLess(low, 0.1)
        print("pattern.metrics.smoothstep()")

    def test_smoothrange(self):
        # Assert nice ranges for line charts.
        cases = (
            ((0.0, 1.0), [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            ((-2, 2), [-2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]),
            ((1, 13), [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0]),
        )
        for bounds, expected in cases:
            actual = list(metrics.smoothrange(*bounds))
            # zip() stops at the shorter sequence, so only the common prefix
            # of actual and expected values is compared.
            for x, y in zip(actual, expected):
                self.assertAlmostEqual(x, y, places=1)
        print("pattern.metrics.smoothrange()")
|
|
|
|
|
|
class TestStatistics(unittest.TestCase):
    """Tests for the descriptive-statistics functions of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        pass

    def test_mean(self):
        # Assert (1+2+3+4) / 4 = 2.5.
        v = metrics.mean([1, 2, 3, 4])
        self.assertEqual(v, 2.5)
        print("pattern.metrics.mean()")

    def test_median(self):
        # Assert 2.5 (between 2 and 3).
        v = metrics.median([1, 2, 3, 4])
        self.assertEqual(v, 2.5)
        # Assert 3 (middle of list).
        v = metrics.median([1, 2, 3, 4, 5])
        self.assertEqual(v, 3)
        # Assert that empty list raises ValueError.
        self.assertRaises(ValueError, metrics.median, [])
        print("pattern.metrics.median()")

    def test_variance(self):
        # Assert 2.5 (sample variance).
        v = metrics.variance([1, 2, 3, 4, 5], sample=True)
        self.assertEqual(v, 2.5)
        # Assert 2.0 (population variance).
        v = metrics.variance([1, 2, 3, 4, 5], sample=False)
        self.assertEqual(v, 2.0)
        print("pattern.metrics.variance()")

    def test_standard_deviation(self):
        # Assert 2.429 (sample).
        v = metrics.standard_deviation([1, 5, 6, 7, 6, 8], sample=True)
        self.assertAlmostEqual(v, 2.429, places=3)
        # Assert 2.217 (population).
        v = metrics.standard_deviation([1, 5, 6, 7, 6, 8], sample=False)
        self.assertAlmostEqual(v, 2.217, places=3)
        print("pattern.metrics.standard_deviation()")

    def test_histogram(self):
        # Assert 1 bin.
        v = metrics.histogram([1, 2, 3, 4], k=0)
        self.assertEqual(len(v), 1)
        # With k=4 over the range (0.5, 4.5): for the i-th bin (sorted by its
        # bounds), assert that its first value and its midpoint both equal i + 1.
        v = metrics.histogram([1, 2, 3, 4], k=4, range=(0.5, 4.5))
        # The loop variable gets its own name so that it does not shadow the
        # histogram being iterated.
        for i, ((start, stop), values) in enumerate(sorted(v.items())):
            self.assertEqual(i + 1, values[0])
            self.assertAlmostEqual(start + (stop - start) / 2, i + 1, places=3)
        # Assert 2 bins, one with all the low numbers, one with the high number.
        v = metrics.histogram([1, 2, 3, 4, 100], k=2)
        v = sorted(v.values(), key=len)
        self.assertEqual(v[0], [100])
        self.assertEqual(v[1], [1, 2, 3, 4])
        print("pattern.metrics.histogram()")

    def test_moment(self):
        # Assert 0.0 (1st central moment = 0.0).
        v = metrics.moment([1, 2, 3, 4, 5], n=1)
        self.assertEqual(v, 0.0)
        # Assert 2.0 (2nd central moment = population variance).
        v = metrics.moment([1, 2, 3, 4, 5], n=2)
        self.assertEqual(v, 2.0)
        print("pattern.metrics.moment()")

    def test_skewness(self):
        # Assert < 0.0 (few low values).
        v = metrics.skewness([1, 100, 101, 102, 103])
        self.assertLess(v, 0.0)
        # Assert > 0.0 (few high values).
        v = metrics.skewness([1, 2, 3, 4, 100])
        self.assertGreater(v, 0.0)
        # Assert 0.0 (evenly distributed).
        v = metrics.skewness([1, 2, 3, 4])
        self.assertEqual(v, 0.0)
        print("pattern.metrics.skewness()")

    def test_kurtosis(self):
        # Assert -1.2 for the uniform distribution.
        a = 1
        b = 1000
        v = metrics.kurtosis([float(i - a) / (b - a) for i in range(a, b)])
        self.assertAlmostEqual(v, -1.2, places=3)
        print("pattern.metrics.kurtosis()")

    def test_quantile(self):
        # Assert 2.5 (quantile with p=0.5 == median).
        v = metrics.quantile([1, 2, 3, 4], p=0.5, a=1, b=-1, c=0, d=1)
        self.assertEqual(v, 2.5)
        # Assert 3.0 (discontinuous sample).
        v = metrics.quantile([1, 2, 3, 4], p=0.5, a=0.5, b=0, c=1, d=0)
        self.assertEqual(v, 3.0)
        # Print the progress marker like every other test does; a test method
        # must not return a value (deprecated by unittest since Python 3.11).
        print("pattern.metrics.quantile()")

    def test_boxplot(self):
        # Different a,b,c,d quantile parameters produce different results.
        # By approximation, assert (53, 79.5, 84.5, 92, 98).
        a = [79, 53, 82, 91, 87, 98, 80, 93]
        v = metrics.boxplot(a)
        self.assertEqual(v[0], min(a))
        self.assertLessEqual(abs(v[1] - 79.5), 0.5)
        self.assertLessEqual(abs(v[2] - metrics.median(a)), 0.5)
        self.assertLessEqual(abs(v[3] - 92.0), 0.5)
        self.assertEqual(v[4], max(a))
        print("pattern.metrics.boxplot()")
|
|
|
|
|
|
class TestStatisticalTests(unittest.TestCase):
    """Checks for the significance tests of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        pass

    def test_fisher_test(self):
        # Fisher exact test significance for two 2x2 tables.
        p = metrics.fisher_exact_test(a=1, b=9, c=11, d=3)
        self.assertAlmostEqual(p, 0.0028, places=4)
        p = metrics.fisher_exact_test(a=45, b=15, c=75, d=45)
        self.assertAlmostEqual(p, 0.1307, places=4)
        print("pattern.metrics.fisher_test()")

    def test_chi_squared(self):
        # Chi-squared test (upper tail).
        # Each case is (observed, expected); two cases pass an empty expected table.
        cases = [
            ([[44, 56]], [[50, 50]]),
            ([[22, 21, 22, 27, 22, 36]], []),
            ([[48, 35, 15, 3]], [[58, 34.5, 7, 0.5]]),
            ([[36, 14], [30, 25]], []),
            ([[46, 71], [37, 83]], [[40.97, 76.02], [42.03, 77.97]]),
        ]
        # All five results are computed before the first assertion runs.
        r1, r2, r3, r4, r5 = [metrics.chi2(observed, expected) for observed, expected in cases]
        # Index 0 and index 1 of each result are checked against reference
        # values; index 1 is only checked for the first two cases.
        self.assertAlmostEqual(r1[0], 1.4400, places=4)
        self.assertAlmostEqual(r1[1], 0.2301, places=4)
        self.assertAlmostEqual(r2[0], 6.7200, places=4)
        self.assertAlmostEqual(r2[1], 0.2423, places=4)
        self.assertAlmostEqual(r3[0], 23.3742, places=4)
        self.assertAlmostEqual(r4[0], 3.4177, places=4)
        self.assertAlmostEqual(r5[0], 1.8755, places=4)
        print("pattern.metrics.chi2()")

    def test_chi_squared_p(self):
        # Chi-squared P-value (upper tail): for each number of degrees of
        # freedom, the i-th statistic must yield a P-value below the i-th threshold.
        thresholds = (0.05, 0.025, 0.01, 0.005)
        table = [
            (1, (3.85, 5.05, 6.65, 7.90)),
            (2, (6.00, 7.40, 9.25, 10.65)),
            (3, (7.85, 9.40, 11.35, 12.85)),
            (4, (9.50, 11.15, 13.30, 14.90)),
            (5, (11.10, 12.85, 15.10, 16.80)),
        ]
        for df, statistics in table:
            for x2, alpha in zip(statistics, thresholds):
                p = metrics.chi2p(x2, df, tail=metrics.UPPER)
                self.assertLess(p, alpha)
        print("pattern.metrics.chi2p()")

    def test_kolmogorov_smirnov(self):
        # Two-sample test on two nearly identical samples: both elements of
        # the result are checked against reference values.
        result = metrics.ks2([1, 2, 3], [1, 2, 4])
        self.assertAlmostEqual(result[0], 0.3333, places=4)
        self.assertAlmostEqual(result[1], 0.9762, places=4)
        print("pattern.metrics.ks2()")
|
|
|
|
|
|
class TestSpecialFunctions(unittest.TestCase):
    """Checks for the special mathematical functions of pattern.metrics.

    Each test prints the name of the function it covers once its assertions pass.
    """

    def setUp(self):
        pass

    def test_gamma(self):
        # Complete gamma function: gamma(0.5) must equal sqrt(pi) to 4 places.
        expected = math.sqrt(math.pi)
        self.assertAlmostEqual(metrics.gamma(0.5), expected, places=4)
        print("pattern.metrics.gamma()")

    def test_gammai(self):
        # Incomplete gamma function at a=1, x=2.
        value = metrics.gammai(a=1, x=2)
        self.assertAlmostEqual(value, 0.1353, places=4)
        print("pattern.metrics.gammai()")

    def test_erfc(self):
        # Complementary error function against a table of (x, erfc(x)) values.
        table = [
            (-3.00, 2.000),
            (-2.00, 1.995),
            (-1.00, 1.843),
            (-0.50, 1.520),
            (-0.25, 1.276),
            (0.00, 1.000),
            (0.25, 0.724),
            (0.50, 0.480),
            (1.00, 0.157),
            (2.00, 0.005),
            (3.00, 0.000),
        ]
        for x, expected in table:
            self.assertAlmostEqual(metrics.erfc(x), expected, places=3)
        print("pattern.metrics.erfc()")

    def test_kolmogorov(self):
        # Kolmogorov limit distribution against a table of (x, expected) values.
        table = (
            (0.0, 1.0000),
            (0.5, 0.9639),
            (1.0, 0.2700),
            (2.0, 0.0007),
            (4.0, 0.0000),
        )
        for x, expected in table:
            self.assertAlmostEqual(metrics.kolmogorov(x), expected, places=4)
        print("pattern.metrics.kolmogorov()")
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
def suite():
    """Return a unittest.TestSuite with the tests of every TestCase class in this module.

    The classes are added in the order in which they are defined in the file.
    """
    test_cases = (
        TestProfiling,
        TestTextMetrics,
        TestInterpolation,
        TestStatistics,
        TestStatisticalTests,
        TestSpecialFunctions,
    )
    collected = unittest.TestSuite()
    for test_case in test_cases:
        # A fresh loader is used for each class.
        collected.addTest(unittest.TestLoader().loadTestsFromTestCase(test_case))
    return collected
|
|
|
|
if __name__ == "__main__":
    # Run the whole suite and exit with a non-zero status when any test did not succeed.
    runner = unittest.TextTestRunner(verbosity=1)
    outcome = runner.run(suite())
    sys.exit(not outcome.wasSuccessful())
|