You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1044 lines
44 KiB
Python
1044 lines
44 KiB
Python
5 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import print_function
|
||
|
from __future__ import unicode_literals
|
||
|
from __future__ import division
|
||
|
|
||
|
from builtins import str, bytes, dict, int
|
||
|
from builtins import map, zip, filter
|
||
|
from builtins import object, range
|
||
|
|
||
|
from io import open
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||
|
import time
|
||
|
import random
|
||
|
import unittest
|
||
|
|
||
|
from random import seed
|
||
|
seed(0)
|
||
|
|
||
|
from pattern import vector
|
||
|
|
||
|
from pattern.en import Text, Sentence, Word, parse
|
||
|
from pattern.db import Datasheet
|
||
|
|
||
|
# Directory of this test file; used to locate the test corpora.
# __file__ is undefined in some contexts (e.g. interactive/frozen),
# which raises NameError — the original bare "except:" also swallowed
# unrelated errors, so catch only what can actually happen here.
try:
    PATH = os.path.dirname(os.path.realpath(__file__))
except NameError:
    PATH = ""
|
||
|
|
||
|
|
||
|
def model(top=None):
    """Return a Model of e-mail messages loaded from corpora/spam-apache.csv.

    Each row of the CSV is (score, message); Document type=True => HAM,
    False => SPAM. Documents are mostly of a technical nature (developer
    forum posts). The optional `top` limits each document to its top-n
    features.
    """
    corpus = Datasheet.load(os.path.join(PATH, "corpora", "spam-apache.csv"))
    documents = [
        vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        for score, message in corpus]
    return vector.Model(documents)
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestUnicode(unittest.TestCase):
    """Assert that pattern.vector transcodes str/bytes in any encoding."""

    def setUp(self):
        # Test data with different (or wrong) encodings.
        self.strings = (
            "ünîcøde",
            "ünîcøde".encode("utf-16"),
            "ünîcøde".encode("latin-1"),
            "ünîcøde".encode("windows-1252"),
            "ünîcøde",
            "אוניקאָד"
        )

    def test_decode_utf8(self):
        # Assert unicode string output, whatever the input encoding.
        for value in self.strings:
            self.assertIsInstance(vector.decode_utf8(value), str)
        print("pattern.vector.decode_utf8()")

    def test_encode_utf8(self):
        # Assert Python bytestring output, whatever the input encoding.
        for value in self.strings:
            self.assertIsInstance(vector.encode_utf8(value), bytes)
        print("pattern.vector.encode_utf8()")
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestUtilityFunctions(unittest.TestCase):
    """Tests for the small helper functions in pattern.vector."""

    def setUp(self):
        pass

    def test_shi(self):
        # Assert integer hashing algorithm.
        expected = ((100, "1c"), (1000, "G8"), (10000, "2bI"), (100000, "Q0u"))
        for number, hashed in expected:
            self.assertEqual(vector.shi(number), hashed)
        print("pattern.vector.shi()")

    def test_shuffled(self):
        # Assert shuffled() <=> sorted().
        original = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        mixed = vector.shuffled(original)
        self.assertTrue(original != mixed and original == sorted(mixed))
        print("pattern.vector.shuffled()")

    def test_chunk(self):
        # Assert list chunks of (near-)equal size.
        cases = (
            ([1, 2, 3, 4, 5], 0, []),
            ([1, 2, 3, 4, 5], 1, [[1, 2, 3, 4, 5]]),
            ([1, 2, 3, 4, 5], 2, [[1, 2, 3], [4, 5]]),
            ([1, 2, 3, 4, 5], 3, [[1, 2], [3, 4], [5]]),
            ([1, 2, 3, 4, 5], 4, [[1, 2], [3], [4], [5]]),
            ([1, 2, 3, 4, 5], 5, [[1], [2], [3], [4], [5]]),
            ([1, 2, 3, 4, 5], 6, [[1], [2], [3], [4], [5], []]))
        for given, n, expected in cases:
            self.assertEqual(list(vector.chunk(given, n)), expected)
        print("pattern.vector.chunk()")

    def test_readonlydict(self):
        # Assert read-only dict: every mutating method raises ReadOnlyError.
        d = vector.readonlydict({"a": 1})
        self.assertTrue(isinstance(d, dict))
        self.assertRaises(vector.ReadOnlyError, d.__setitem__, "a", 2)
        self.assertRaises(vector.ReadOnlyError, d.__delitem__, "a")
        self.assertRaises(vector.ReadOnlyError, d.pop, "a")
        self.assertRaises(vector.ReadOnlyError, d.popitem, ("a", 2))
        self.assertRaises(vector.ReadOnlyError, d.clear)
        self.assertRaises(vector.ReadOnlyError, d.update, {"b": 2})
        self.assertRaises(vector.ReadOnlyError, d.setdefault, "b", 2)
        print("pattern.vector.readonlydict")

    def test_readonlylist(self):
        # Assert read-only list: every mutating method raises ReadOnlyError.
        seq = vector.readonlylist([1, 2])
        self.assertTrue(isinstance(seq, list))
        self.assertRaises(vector.ReadOnlyError, seq.__setitem__, 0, 0)
        self.assertRaises(vector.ReadOnlyError, seq.__delitem__, 0)
        self.assertRaises(vector.ReadOnlyError, seq.append, 3)
        self.assertRaises(vector.ReadOnlyError, seq.insert, 2, 3)
        self.assertRaises(vector.ReadOnlyError, seq.extend, [3, 4])
        self.assertRaises(vector.ReadOnlyError, seq.remove, 1)
        self.assertRaises(vector.ReadOnlyError, seq.pop, 0)
        print("pattern.vector.readonlylist")
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestStemmer(unittest.TestCase):
    """Accuracy tests for the bundled Porter2 stemmer."""

    def setUp(self):
        # Test data from http://snowball.tartarus.org/algorithms/english/stemmer.html
        self.input = [
            'consign', 'consigned', 'consigning', 'consignment', 'consist', 'consisted', 'consistency',
            'consistent', 'consistently', 'consisting', 'consists', 'consolation', 'consolations',
            'consolatory', 'console', 'consoled', 'consoles', 'consolidate', 'consolidated', 'consolidating',
            'consoling', 'consolingly', 'consols', 'consonant', 'consort', 'consorted', 'consorting',
            'conspicuous', 'conspicuously', 'conspiracy', 'conspirator', 'conspirators', 'conspire',
            'conspired', 'conspiring', 'constable', 'constables', 'constance', 'constancy', 'constant',
            'generate', 'generates', 'generated', 'generating', 'general', 'generally', 'generic',
            'generically', 'generous', 'generously', 'knack', 'knackeries', 'knacks', 'knag', 'knave',
            'knaves', 'knavish', 'kneaded', 'kneading', 'knee', 'kneel', 'kneeled', 'kneeling', 'kneels',
            'knees', 'knell', 'knelt', 'knew', 'knick', 'knif', 'knife', 'knight', 'knightly', 'knights',
            'knit', 'knits', 'knitted', 'knitting', 'knives', 'knob', 'knobs', 'knock', 'knocked', 'knocker',
            'knockers', 'knocking', 'knocks', 'knopp', 'knot', 'knots', 'skies', 'spy'
        ]
        self.output = [
            'consign', 'consign', 'consign', 'consign', 'consist', 'consist', 'consist', 'consist', 'consist',
            'consist', 'consist', 'consol', 'consol', 'consolatori', 'consol', 'consol', 'consol', 'consolid',
            'consolid', 'consolid', 'consol', 'consol', 'consol', 'conson', 'consort', 'consort', 'consort',
            'conspicu', 'conspicu', 'conspiraci', 'conspir', 'conspir', 'conspir', 'conspir', 'conspir',
            'constabl', 'constabl', 'constanc', 'constanc', 'constant', 'generat', 'generat', 'generat',
            'generat', 'general', 'general', 'generic', 'generic', 'generous', 'generous', 'knack', 'knackeri',
            'knack', 'knag', 'knave', 'knave', 'knavish', 'knead', 'knead', 'knee', 'kneel', 'kneel', 'kneel',
            'kneel', 'knee', 'knell', 'knelt', 'knew', 'knick', 'knif', 'knife', 'knight', 'knight', 'knight',
            'knit', 'knit', 'knit', 'knit', 'knive', 'knob', 'knob', 'knock', 'knock', 'knocker', 'knocker',
            'knock', 'knock', 'knopp', 'knot', 'knot', 'sky', 'spi'
        ]

    def test_stem(self):
        # Assert 100% accuracy of the stemmer on the reference vocabulary.
        correct = sum(1 for word, expected in zip(self.input, self.output)
                        if vector.stemmer.stem(word, cached=True) == expected)
        self.assertEqual(float(correct) / len(self.input), 1.0)
        print("pattern.vector.stemmer.stem()")

    def test_stem_case_sensitive(self):
        # Assert stemmer case-sensitivity.
        cases = (
            ("Ponies", "Poni"),
            ("pONIES", "pONI"),
            ("SKiES", "SKy"),
            ("cosmos", "cosmos"))
        for word, expected in cases:
            self.assertEqual(vector.stemmer.stem(word), expected)
        print("pattern.vector.stemmer.case_sensitive()")
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestDocument(unittest.TestCase):
    """Tests for pattern.vector.Document and the words/stem/count helpers."""

    def setUp(self):
        # Temporary file for the save + load round-trip test.
        self.path = "test_document2.txt"

    def tearDown(self):
        if os.path.exists(self.path):
            os.remove(self.path)

    def test_stopwords(self):
        # Assert common stop words.
        for word in ("a", "am", "an", "and", "i", "the", "therefore", "they", "what", "while"):
            self.assertIn(word, vector.stopwords["en"])
        print("pattern.vector.stopwords")

    def test_words(self):
        # Assert word split algorithm (default treats lines as spaces and ignores numbers).
        s = "The cat sat on the\nmat. 1 11."
        alpha_only = vector.words(s, filter=lambda w: w.isalpha())
        self.assertEqual(alpha_only, ["The", "cat", "sat", "on", "the", "mat"])
        # Assert custom word filter.
        everything = vector.words(s, filter=lambda w: True)
        self.assertEqual(everything, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
        print("pattern.vector.words()")

    def test_stem(self):
        # Assert stem with PORTER, LEMMA and pattern.en.Word.
        s = "WOLVES"
        cases = (
            (vector.stem(s, stemmer=None), "wolves"),
            (vector.stem(s, stemmer=vector.PORTER), "wolv"),
            (vector.stem(s, stemmer=vector.LEMMA), "wolf"),
            (vector.stem(s, stemmer=lambda w: "wolf*"), "wolf*"),
            (vector.stem(Word(None, s, lemma="wolf*"), stemmer=vector.LEMMA), "wolf*"),
            (vector.stem(Word(None, s, type="NNS"), stemmer=vector.LEMMA), "wolf"))
        for stemmed, expected in cases:
            self.assertEqual(stemmed, expected)
            # Assert unicode output.
            self.assertTrue(isinstance(stemmed, str))
        print("pattern.vector.stem()")

    def test_count(self):
        # Assert wordcount with stemming, stopwords and pruning.
        w = ["The", "cats", "sat", "on", "the", "mat", "."]
        self.assertEqual(vector.count(w),
                         {"cats": 1, "sat": 1, "mat": 1, ".": 1})
        self.assertEqual(vector.count(w, stemmer=vector.LEMMA),
                         {"cat": 1, "sat": 1, "mat": 1, ".": 1})
        self.assertEqual(vector.count(w, exclude=["."]),
                         {"cats": 1, "sat": 1, "mat": 1})
        self.assertEqual(vector.count(w, stopwords=True),
                         {"the": 2, "cats": 1, "sat": 1, "on": 1, "mat": 1, ".": 1})
        self.assertEqual(vector.count(w, stopwords=True, top=3),
                         {"the": 2, "cats": 1, ".": 1})
        self.assertEqual(vector.count(w, stopwords=True, top=3, threshold=1),
                         {"the": 2})
        # Assert custom dict class.
        counts = vector.count(w, dict=vector.readonlydict, cached=False)
        self.assertTrue(isinstance(counts, vector.readonlydict))
        print("pattern.vector.count()")

    def test_document(self):
        # Assert Document properties, with different input types.
        inputs = (
            "The cats sit on the mat.",
            ["The", "cats", "sit", "on", "the", "mat"],
            {"cat": 1, "mat": 1, "sit": 1},
            Text(parse("The cats sat on the mat.")),
            Sentence(parse("The cats sat on the mat.")))
        for w in inputs:
            # Test copy.
            doc = vector.Document(w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT")
            doc = doc.copy()
            # Test properties.
            self.assertEqual(doc.name, "Cat")
            self.assertEqual(doc.type, "CAT")
            self.assertEqual(doc.count, 3)
            self.assertEqual(doc.terms, {"cat": 1, "mat": 1, "sit": 1})
            # Test iterator decoration.
            self.assertEqual(sorted(doc.features), ["cat", "mat", "sit"])
            self.assertEqual(sorted(doc), ["cat", "mat", "sit"])
            self.assertEqual(len(doc), 3)
            self.assertEqual(doc["cat"], 1)
            self.assertEqual("cat" in doc, True)
        print("pattern.vector.Document")

    def test_document_load(self):
        # Assert save + load document integrity.
        d1 = vector.Document("The cats are purring on the mat.",
                             stemmer=vector.PORTER, stopwords=True, name="Cat", type="CAT")
        d1.save(self.path)
        d2 = vector.Document.load(self.path)
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(d1.type, d2.type)
        self.assertEqual(d1.vector, d2.vector)
        print("pattern.vector.Document.save()")
        print("pattern.vector.Document.load()")

    def test_document_vector(self):
        # Assert Vector properties; start from a copy.
        vec = vector.Document("the cat sat on the mat").vector
        vec = vec.copy()
        # Test properties.
        self.assertTrue(isinstance(vec, dict))
        self.assertTrue(isinstance(vec, vector.Vector))
        self.assertTrue(isinstance(vec.id, int))
        self.assertEqual(sorted(vec.features), ["cat", "mat", "sat"])
        self.assertEqual(vec.weight, vector.TF)
        self.assertAlmostEqual(vec.norm, 0.58, places=2)
        for feature in ("cat", "sat", "mat"):
            self.assertAlmostEqual(vec[feature], 0.33, places=2)
        # Test copy + update.
        vec = vec({"cat": 1, "sat": 1, "mat": 1})
        self.assertEqual(sorted(vec.features), ["cat", "mat", "sat"])
        for feature in ("cat", "sat", "mat"):
            self.assertAlmostEqual(vec[feature], 1.00, places=2)
        print("pattern.vector.Document.vector")

    def test_document_keywords(self):
        # Assert Document.keywords() based on term frequency.
        kw = vector.Document(["cat", "cat", "cat", "sat", "sat", "mat"]).keywords(top=2)
        self.assertEqual(len(kw), 2)
        self.assertEqual(kw[0][1], "cat")
        self.assertEqual(kw[1][1], "sat")
        self.assertAlmostEqual(kw[0][0], 0.50, places=2)
        self.assertAlmostEqual(kw[1][0], 0.33, places=2)
        print("pattern.vector.Document.keywords()")

    def test_tf(self):
        # Assert Document.term_frequency() (= weights used in Vector for orphaned documents).
        doc = vector.Document("the cat sat on the mat")
        for feature, weight in doc.vector.items():
            self.assertEqual(doc.term_frequency(feature), weight)
            self.assertAlmostEqual(doc.term_frequency(feature), 0.33, places=2)
        print("pattern.vector.Document.tf()")

    def test_tfidf(self):
        # Assert tf-idf for documents not in a model.
        rows = [[0.0, 0.4, 0.6], [0.6, 0.4, 0.0]]
        rows = [dict(enumerate(r)) for r in rows]
        m = vector.Model([vector.Document(r) for r in rows], weight=vector.TFIDF)
        weighted = [vector.sparse(w) for w in vector.tf_idf(rows)]
        self.assertEqual(sorted(m[0].vector.items()), sorted(weighted[0].items()))
        self.assertAlmostEqual(weighted[0][2], 0.42, places=2)
        self.assertAlmostEqual(weighted[1][0], 0.42, places=2)
        print("pattern.vector.tf_idf()")

    def test_cosine_similarity(self):
        # Test cosine similarity for documents not in a model.
        d1 = vector.Document("the cat sat on the mat")
        d2 = vector.Document("a cat with a hat")
        self.assertAlmostEqual(d1.cosine_similarity(d2), 0.41, places=2)
        print("pattern.vector.Document.similarity()")
        print("pattern.vector.cosine_similarity()")
        print("pattern.vector.l2_norm()")
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestModel(unittest.TestCase):
    """Tests for pattern.vector.Model: weighting, search, clustering, LSA, ML."""

    def setUp(self):
        # Test model: two "cat" documents and two "dog" documents.
        self.model = vector.Model(documents=(
            vector.Document("cats purr", name="cat1", type="cåt"),
            vector.Document("cats meow", name="cat2", type="cåt"),
            vector.Document("dogs howl", name="dog1", type="døg"),
            vector.Document("dogs bark", name="dog2", type="døg")
        ))

    def test_model(self):
        # Assert Model properties.
        v = self.model
        self.assertEqual(list(v), v.documents)
        self.assertEqual(len(v), 4)
        self.assertEqual(sorted(v.terms), ["bark", "cats", "dogs", "howl", "meow", "purr"])
        self.assertEqual(sorted(v.terms), sorted(v.vector.keys()))
        self.assertEqual(v.weight, vector.TFIDF)
        self.assertEqual(v.lsa, None)
        self.assertEqual(v.vectors, [d.vector for d in v.documents])
        self.assertAlmostEqual(v.density, 0.22, places=2)
        print("pattern.vector.Model")

    def test_model_append(self):
        # Assert Model.append(); Model.documents itself is read-only.
        self.assertRaises(vector.ReadOnlyError, self.model.documents.append, None)
        self.model.append(vector.Document("birds chirp", name="bird"))
        # Appending invalidates cached document vectors.
        self.assertEqual(self.model[0]._vector, None)
        self.assertEqual(len(self.model), 5)
        self.model.remove(self.model.document("bird"))
        print("pattern.vector.Model.append()")

    def test_model_save(self):
        # Assert Model save & load.
        self.model.save("test_model.pickle", update=True)
        self.model._update()
        model = vector.Model.load("test_model.pickle")
        # Assert that the precious cache is saved and reloaded.
        self.assertTrue(len(model._df) > 0)
        self.assertTrue(len(model._cos) > 0)
        self.assertTrue(len(model.vectors) > 0)
        os.remove("test_model.pickle")
        print("pattern.vector.Model.save()")
        print("pattern.vector.Model.load()")

    def test_model_export(self):
        # Assert Orange and Weka ARFF export formats.
        for fmt, src in (
          (vector.ORANGE,
            "bark\tcats\tdogs\thowl\tmeow\tpurr\tm#name\tc#type\n"
            "0\t0.3466\t0\t0\t0\t0.6931\tcat1\tcåt\n"
            "0\t0.3466\t0\t0\t0.6931\t0\tcat2\tcåt\n"
            "0\t0\t0.3466\t0.6931\t0\t0\tdog1\tdøg\n"
            "0.6931\t0\t0.3466\t0\t0\t0\tdog2\tdøg"),
          (vector.WEKA,
            "@RELATION 5885744\n"
            "@ATTRIBUTE bark NUMERIC\n"
            "@ATTRIBUTE cats NUMERIC\n"
            "@ATTRIBUTE dogs NUMERIC\n"
            "@ATTRIBUTE howl NUMERIC\n"
            "@ATTRIBUTE meow NUMERIC\n"
            "@ATTRIBUTE purr NUMERIC\n"
            "@ATTRIBUTE class {døg,cåt}\n"
            "@DATA\n0,0.3466,0,0,0,0.6931,cåt\n"
            "0,0.3466,0,0,0.6931,0,cåt\n"
            "0,0,0.3466,0.6931,0,0,døg\n"
            "0.6931,0,0.3466,0,0,0,døg")):
            self.model.export("test_%s.txt" % fmt, format=fmt)
            # Close the file handle explicitly (the original leaked it).
            with open("test_%s.txt" % fmt, encoding="utf-8") as f:
                v = f.read()
            v = v.replace("\r\n", "\n")
            for line in src.split("\n"):
                # BUGFIX: the original asserted `line in src` (i.e. src against
                # itself, always true); compare against the exported contents.
                self.assertTrue(line in v)
            os.remove("test_%s.txt" % fmt)
        print("pattern.vector.Model.export()")

    def test_df(self):
        # Assert document frequency: "cats" appears in 1/2 documents, "purr" in 1/4.
        self.assertEqual(self.model.df("cats"), 0.50)
        self.assertEqual(self.model.df("purr"), 0.25)
        self.assertEqual(self.model.df("????"), 0.00)
        print("pattern.vector.Model.df()")

    def test_idf(self):
        # Assert inverse document frequency: log(1/df).
        self.assertAlmostEqual(self.model.idf("cats"), 0.69, places=2)
        self.assertAlmostEqual(self.model.idf("purr"), 1.39, places=2)
        self.assertEqual(self.model.idf("????"), None)
        print("pattern.vector.Model.idf()")

    def test_tfidf(self):
        # Assert term frequency - inverse document frequency: tf * idf.
        self.assertAlmostEqual(self.model[0].tfidf("cats"), 0.35, places=2) # 0.50 * 0.69
        self.assertAlmostEqual(self.model[0].tfidf("purr"), 0.69, places=2) # 0.50 * 1.39
        self.assertAlmostEqual(self.model[0].tfidf("????"), 0.00, places=2)
        print("pattern.vector.Document.tfidf()")

    def test_frequent_concept_sets(self):
        # Assert Apriori algorithm.
        v = self.model.frequent(threshold=0.5)
        if sys.version > "3":
            self.assertCountEqual(sorted(list(v.keys())), [frozenset(["dogs"]), frozenset(["cats"])])
        else:
            self.assertItemsEqual(sorted(list(v.keys())), [frozenset(["dogs"]), frozenset(["cats"])])
        print("pattern.vector.Model.frequent()")

    def test_cosine_similarity(self):
        # Assert document cosine similarity.
        v1 = self.model.similarity(self.model[0], self.model[1])
        v2 = self.model.similarity(self.model[0], self.model[2])
        v3 = self.model.similarity(self.model[0], vector.Document("cats cats"))
        self.assertAlmostEqual(v1, 0.20, places=2)
        self.assertAlmostEqual(v2, 0.00, places=2)
        self.assertAlmostEqual(v3, 0.45, places=2)
        # Assert that Model.similarity() is aware of LSA reduction.
        self.model.reduce(2)
        v1 = self.model.similarity(self.model[0], self.model[1])
        v2 = self.model.similarity(self.model[0], self.model[2])
        self.assertAlmostEqual(v1, 1.00, places=2)
        self.assertAlmostEqual(v2, 0.00, places=2)
        self.model.lsa = None
        print("pattern.vector.Model.similarity()")

    def test_nearest_neighbors(self):
        # Assert document nearest-neighbor search.
        v1 = self.model.neighbors(self.model[0])
        v2 = self.model.neighbors(vector.Document("cats meow"))
        v3 = self.model.neighbors(vector.Document("????"))
        self.assertEqual(v1[0][1], self.model[1])
        self.assertEqual(v2[0][1], self.model[1])
        self.assertEqual(v2[1][1], self.model[0])
        self.assertAlmostEqual(v1[0][0], 0.20, places=2)
        self.assertAlmostEqual(v2[0][0], 0.95, places=2)
        self.assertAlmostEqual(v2[1][0], 0.32, places=2)
        self.assertTrue(len(v3) == 0)
        print("pattern.vector.Model.neighbors()")

    def test_search(self):
        # Assert document vector space search (accepts Document, str or list).
        v1 = self.model.search(self.model[0])
        v2 = self.model.search(vector.Document("cats meow"))
        v3 = self.model.search(vector.Document("????"))
        v4 = self.model.search("meow")
        v5 = self.model.search(["cats", "meow"])
        self.assertEqual(v1, self.model.neighbors(self.model[0]))
        self.assertEqual(v2[0][1], self.model[1])
        self.assertEqual(v3, [])
        self.assertEqual(v4[0][1], self.model[1])
        self.assertEqual(v5[0][1], self.model[1])
        self.assertAlmostEqual(v4[0][0], 0.89, places=2)
        self.assertAlmostEqual(v5[0][0], 1.00, places=2)
        print("pattern.vector.Model.search()")

    def test_distance(self):
        # Assert Model document distance.
        v1 = self.model.distance(self.model[0], self.model[1], method=vector.COSINE)
        v2 = self.model.distance(self.model[0], self.model[2], method=vector.COSINE)
        v3 = self.model.distance(self.model[0], self.model[2], method=vector.EUCLIDEAN)
        self.assertAlmostEqual(v1, 0.8, places=1)
        self.assertAlmostEqual(v2, 1.0, places=1)
        self.assertAlmostEqual(v3, 1.2, places=1)
        print("pattern.vector.Model.distance()")

    def test_cluster(self):
        # Assert Model document clustering.
        v1 = self.model.cluster(method=vector.KMEANS, k=10)
        v2 = self.model.cluster(method=vector.HIERARCHICAL, k=1)
        self.assertTrue(isinstance(v1, list) and len(v1) == 10)
        self.assertTrue(isinstance(v2, vector.Cluster))

        # Cats must cluster together, dogs must cluster together.
        def _test_clustered_documents(cluster):
            if self.model[0] in cluster:
                self.assertTrue(self.model[1] in cluster \
                            and not self.model[2] in cluster)
            if self.model[2] in cluster:
                self.assertTrue(self.model[3] in cluster \
                            and not self.model[1] in cluster)
        v2.traverse(_test_clustered_documents)
        print("pattern.vector.Model.cluster()")

    def test_centroid(self):
        # Assert centroid of recursive Cluster.
        v = vector.Cluster(({"a": 1}, vector.Cluster(({"a": 2}, {"a": 4}))))
        self.assertAlmostEqual(vector.centroid(v)["a"], 2.33, places=2)
        print("pattern.vector.centroid()")

    def test_lsa(self):
        # Assert Model.reduce() LSA reduction.
        self.model.reduce(2)
        self.assertTrue(isinstance(self.model.lsa, vector.LSA))
        self.model.lsa = None
        print("pattern.vector.Model.reduce()")

    def test_feature_selection(self):
        # Assert information gain feature selection.
        m = vector.Model((
            vector.Document("the cat sat on the mat", type="cat", stopwords=True),
            vector.Document("the dog howled at the moon", type="dog", stopwords=True)
        ))
        v = m.feature_selection(top=3, method=vector.IG, threshold=0.0)
        self.assertEqual(v, ["at", "cat", "dog"])
        # Assert Model.filter().
        v = m.filter(v)
        self.assertTrue("at" in v.terms)
        self.assertTrue("cat" in v.terms)
        self.assertTrue("dog" in v.terms)
        self.assertTrue("the" not in v.terms)
        self.assertTrue("mat" not in v.terms)
        print("pattern.vector.Model.feature_selection()")
        print("pattern.vector.Model.filter()")

    def test_information_gain(self):
        # Assert information gain weights.
        # Example from http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
        m = vector.Model([
            vector.Document({"wind": 1}, type=False),
            vector.Document({"wind": 0}, type=True),
            vector.Document({"wind": 0}, type=True),
            vector.Document({"wind": 0}, type=True),
            vector.Document({"wind": 1}, type=True),
            vector.Document({"wind": 1}, type=False),
            vector.Document({"wind": 1}, type=False)], weight=None
        )
        self.assertAlmostEqual(m.information_gain("wind"), 0.52, places=2)
        # Example from http://rutcor.rutgers.edu/~amai/aimath02/PAPERS/14.pdf
        m = vector.Model([
            vector.Document({"3": 1}, type=True),
            vector.Document({"3": 5}, type=True),
            vector.Document({"3": 1}, type=False),
            vector.Document({"3": 7}, type=True),
            vector.Document({"3": 2}, type=False),
            vector.Document({"3": 2}, type=True),
            vector.Document({"3": 6}, type=False),
            vector.Document({"3": 4}, type=True),
            vector.Document({"3": 0}, type=False),
            vector.Document({"3": 9}, type=True)], weight=None
        )
        self.assertAlmostEqual(m.ig("3"), 0.571, places=3)
        self.assertAlmostEqual(m.gr("3"), 0.195, places=3)
        # BUGFIX: the original printed "patten.vector" (typo).
        print("pattern.vector.Model.information_gain()")
        print("pattern.vector.Model.gain_ratio()")

    def test_entropy(self):
        # Assert Shannon entropy calculation.
        self.assertAlmostEqual(vector.entropy([1, 1]), 1.00, places=2)
        self.assertAlmostEqual(vector.entropy([2, 1]), 0.92, places=2)
        self.assertAlmostEqual(vector.entropy([0.5, 0.5]), 1.00, places=2)
        self.assertAlmostEqual(vector.entropy([0.6]), 0.44, places=2)
        print("pattern.vector.entropy()")

    def test_condensed_nearest_neighbor(self):
        # Assert CNN for data reduction.
        v = vector.Model((
            vector.Document("woof", type="dog"),
            vector.Document("meow", type="cat"), # redundant
            vector.Document("meow meow", type="cat")
        ))
        self.assertTrue(len(v.cnn()) < len(v))
        print("pattern.vector.Model.condensed_nearest_neighbor()")

    def test_classifier(self):
        # Assert that the model classifier is correctly saved and loaded.
        p = "test.model.tmp"
        v = vector.Model([vector.Document("chirp", type="bird")])
        v.train(vector.SVM)
        v.save(p)
        v = vector.Model.load(p)
        self.assertTrue(isinstance(v.classifier, vector.SVM))
        os.unlink(p)
        print("pattern.vector.Model.classifier")
        print("pattern.vector.Model.train()")
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestApriori(unittest.TestCase):
    """Tests for the Apriori frequent-itemset algorithm."""

    def setUp(self):
        pass

    def test_apriori(self):
        # Assert frequent sets frequency.
        v = vector.apriori((
            [1, 2, 4],
            [1, 2, 5],
            [1, 3, 6],
            [1, 3, 7]
        ), support=0.5)
        # BUGFIX: the original used assertTrue(len(v), 3), which treats 3 as
        # the failure *message* and never fails. Assert a real lower bound:
        # at least the four itemsets checked below must be present.
        self.assertGreaterEqual(len(v), 4)
        self.assertEqual(v[frozenset((1, ))], 1.0)
        self.assertEqual(v[frozenset((1, 2))], 0.5)
        self.assertEqual(v[frozenset((2, ))], 0.5)
        self.assertEqual(v[frozenset((3, ))], 0.5)
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestLSA(unittest.TestCase):
    """Tests for latent semantic analysis (vector.LSA / Model.reduce())."""

    # Class-level cache: the spam model is expensive to build, so it is
    # constructed once in setUp() and shared across all test methods.
    model = None

    def setUp(self):
        # Test spam model for reduction.
        if self.__class__.model is None:
            self.__class__.model = model(top=250)
        self.model = self.__class__.model
        # Deterministic runs (shuffling in KNN folds, etc.).
        random.seed(0)

    def tearDown(self):
        # Restore nondeterministic seeding after each test.
        random.seed()

    def test_lsa(self):
        # Assert LSA properties.
        k = 100
        lsa = vector.LSA(self.model, k)
        self.assertEqual(lsa.model, self.model)
        self.assertEqual(lsa.vectors, lsa.u)
        self.assertEqual(set(lsa.terms), set(self.model.vector.keys()))
        self.assertTrue(isinstance(lsa.u, dict))
        self.assertTrue(isinstance(lsa.sigma, list))
        self.assertTrue(isinstance(lsa.vt, list))
        # NOTE(review): the next three assertTrue() calls pass two arguments,
        # so the second argument becomes the failure message and the check is
        # only that len(...) is truthy — assertEqual() was probably intended.
        self.assertTrue(len(lsa.u), len(self.model))
        self.assertTrue(len(lsa.sigma), len(self.model) - k)
        self.assertTrue(len(lsa.vt), len(self.model) - k)
        # Each document maps to a concept vector with at most k dimensions.
        for document in self.model:
            v = lsa.vectors[document.id]
            self.assertTrue(isinstance(v, vector.Vector))
            self.assertTrue(len(v) <= k)
        print("pattern.vector.LSA")

    def test_lsa_concepts(self):
        # Assert LSA concept space.
        model = vector.Model((
            vector.Document("cats purr"),
            vector.Document("cats meow"),
            vector.Document("dogs howl"),
            vector.Document("dogs bark")
        ))
        model.reduce(2)
        # Intuitively, we'd expect two concepts:
        # 1) with cats + purr + meow grouped together,
        # 2) with dogs + howl + bark grouped together.
        i1, i2 = 0, 0
        for i, concept in enumerate(model.lsa.concepts):
            self.assertTrue(isinstance(concept, dict))
            if concept["cats"] > 0.5:
                self.assertTrue(concept["purr"] > 0.5)
                self.assertTrue(concept["meow"] > 0.5)
                self.assertTrue(concept["howl"] == 0.0)
                self.assertTrue(concept["bark"] == 0.0)
                i1 = i
            if concept["dogs"] > 0.5:
                self.assertTrue(concept["howl"] > 0.5)
                self.assertTrue(concept["bark"] > 0.5)
                self.assertTrue(concept["purr"] == 0.0)
                self.assertTrue(concept["meow"] == 0.0)
                i2 = i
        # We'd expect the "cat" documents to score high on the "cat" concept vector.
        # We'd expect the "dog" documents to score high on the "dog" concept vector.
        v1 = model.lsa[model.documents[0].id]
        v2 = model.lsa[model.documents[2].id]
        self.assertTrue(v1.get(i1, 0) > 0.7)
        self.assertTrue(v1.get(i2, 0) == 0.0)
        self.assertTrue(v2.get(i1, 0) == 0.0)
        self.assertTrue(v2.get(i2, 0) > 0.7)
        # Assert LSA.transform() for unknown documents.
        v = model.lsa.transform(vector.Document("cats dogs"))
        self.assertAlmostEqual(v[0], 0.34, places=2)
        self.assertAlmostEqual(v[1], 0.34, places=2)
        print("pattern.vector.LSA.concepts")
        print("pattern.vector.LSA.transform()")

    def test_model_reduce(self):
        # Test time and accuracy of model with sparse vectors of maximum 250 features.
        t1 = time.time()
        A1, P1, R1, F1, stdev = vector.KNN.test(self.model, folds=10)
        t1 = time.time() - t1
        # Test time and accuracy of model with reduced vectors of 20 features.
        self.model.reduce(dimensions=20)
        t2 = time.time()
        A2, P2, R2, F2, stdev = vector.KNN.test(self.model, folds=10)
        t2 = time.time() - t2
        self.assertTrue(len(self.model.lsa[self.model.documents[0].id]) == 20)
        # NOTE(review): timing assertion — may be flaky on loaded machines.
        self.assertTrue(t2 * 2 < t1) # KNN over 2x faster.
        self.assertTrue(abs(F1 - F2) < 0.06) # Difference in F-score = 1-6%.
        self.model.lsa = None
        print("pattern.vector.Model.reduce()")
|
||
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestClustering(unittest.TestCase):

    # Spam model shared by all tests in this case; built lazily in setUp().
    model = None

    def setUp(self):
        # Test spam model for clustering.
        # Vectors contain only the top 10 features (more features = more noise).
        if self.__class__.model is None:
            self.__class__.model = model(top=10)
        self.model = self.__class__.model
        # Seed the RNG so clustering runs are reproducible within each test.
        random.seed(0)

    def tearDown(self):
        # Restore nondeterministic seeding for whatever runs next.
        random.seed()

    def test_features(self):
        # Assert unique list of vector keys.
        v = vector.features(vectors=[{"cat": 1}, {"dog": 1}])
        self.assertEqual(sorted(v), ["cat", "dog"])
        print("pattern.vector.features()")

    def test_mean(self):
        # Assert iterator mean.
        self.assertEqual(vector.mean([], 0), 0)
        self.assertEqual(vector.mean([1, 1.5, 2], 3), 1.5)
        self.assertEqual(vector.mean(range(4), 4), 1.5)
        print("pattern.vector.mean()")

    def test_centroid(self):
        # Assert center of list of vectors.
        v = vector.centroid([{"cat": 1}, {"cat": 0.5, "dog": 1}], features=["cat", "dog"])
        self.assertEqual(v, {"cat": 0.75, "dog": 0.5})
        print("pattern.vector.centroid()")

    def test_distance(self):
        # Assert distance metrics, including a user-defined distance function.
        v1 = vector.Vector({"cat": 1})
        v2 = vector.Vector({"cat": 0.5, "dog": 1})
        for d, method in (
          (0.55, vector.COSINE),    # 1 - ((1*0.5 + 0*1) / (sqrt(1**2 + 0**2) * sqrt(0.5**2 + 1**2)))
          (1.25, vector.EUCLIDEAN), # (1-0.5)**2 + (0-1)**2
          (1.50, vector.MANHATTAN), # abs(1-0.5) + abs(0-1)
          (1.00, vector.HAMMING),   # (True + True) / 2
          (1.11, lambda v1, v2: 1.11)):
            self.assertAlmostEqual(vector.distance(v1, v2, method), d, places=2)
        print("pattern.vector.distance()")

    def test_distancemap(self):
        # Assert distance caching mechanism.
        v1 = vector.Vector({"cat": 1})
        v2 = vector.Vector({"cat": 0.5, "dog": 1})
        m = vector.DistanceMap(method=vector.COSINE)
        for i in range(100):
            self.assertAlmostEqual(m.distance(v1, v2), 0.55, places=2)
        self.assertAlmostEqual(m._cache[(v1.id, v2.id)], 0.55, places=2)
        print("pattern.vector.DistanceMap")

    def _test_k_means(self, seed):
        """Run k-means 30 times on the first 100 documents and return the
        average clustering accuracy (1.0 = ham and spam fully separated)."""
        A = []
        n = 100
        m = dict((d.vector.id, d.type) for d in self.model[:n])
        for _ in range(30):
            # Create two clusters of vectors.
            k = vector.kmeans([d.vector for d in self.model[:n]], k=2, seed=seed)
            # Measure the number of spam in each cluster.
            # Ideally, we have a cluster without spam and one with only spam.
            i = len([1 for v in k[0] if m[v.id] == False])
            j = len([1 for v in k[1] if m[v.id] == False])
            A.append(max(i, j) * 2.0 / n)
        # Return average accuracy after 30 runs.
        return sum(A) / 30.0

    def test_k_means_random(self):
        # Assert k-means with random initialization.
        v = self._test_k_means(seed=vector.RANDOM)
        self.assertTrue(v >= 0.6)
        print("pattern.vector.kmeans(seed=RANDOM)")

    def test_k_means_kmpp(self):
        # Assert k-means with k-means++ initialization.
        # Note: vectors contain the top 10 features - see setUp().
        # If you include more features (more noise?) accuracy and performance will drop.
        v = self._test_k_means(seed=vector.KMPP)
        self.assertTrue(v >= 0.8)
        print("pattern.vector.kmeans(seed=KMPP)")

    def test_hierarchical(self):
        # Assert cluster contains nested clusters and/or vectors.
        def _test_cluster(cluster):
            for nested in cluster:
                if isinstance(nested, vector.Cluster):
                    # A nested cluster is shallower and its vectors are a
                    # subset of the parent's vectors.
                    v1 = set((v.id for v in nested.flatten()))
                    v2 = set((v.id for v in cluster.flatten()))
                    self.assertTrue(nested.depth < cluster.depth)
                    self.assertTrue(v1.issubset(v2))
                else:
                    self.assertTrue(isinstance(nested, vector.Vector))
            self.assertTrue(isinstance(cluster, list))
            self.assertTrue(isinstance(cluster.depth, int))
            self.assertTrue(isinstance(cluster.flatten(), list))
        n = 50
        h = vector.hierarchical([d.vector for d in self.model[:n]], k=2)
        h.traverse(_test_cluster)
        # Assert the accuracy of hierarchical clustering (shallow test).
        # Assert that cats are separated from dogs.
        v = (
            vector.Vector({"feline": 1, "lion": 1, "mane": 1}),
            vector.Vector({"feline": 1, "tiger": 1, "stripe": 1}),
            vector.Vector({"canine": 1, "wolf": 1, "howl": 1}),
            vector.Vector({"canine": 1, "dog": 1, "bark": 1})
        )
        h = vector.hierarchical(v)
        self.assertTrue(len(h[0][0]) == 2)
        self.assertTrue(len(h[0][1]) == 2)
        self.assertTrue(v[0] in h[0][0] and v[1] in h[0][0] or v[0] in h[0][1] and v[1] in h[0][1])
        self.assertTrue(v[2] in h[0][0] and v[3] in h[0][0] or v[2] in h[0][1] and v[3] in h[0][1])
        print("pattern.vector.Cluster()")
        print("pattern.vector.hierarchical()")
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
class TestClassifier(unittest.TestCase):

    # Spam model shared by all tests in this case; built lazily in setUp().
    model = None

    def setUp(self):
        # Test model for training classifiers.
        if self.__class__.model is None:
            self.__class__.model = model()
        self.model = self.__class__.model

    def _test_classifier(self, Classifier, **kwargs):
        """Assert training + prediction + persistence for the given
        Classifier subclass, using trivial spam/ham cases."""
        v = Classifier(**kwargs)
        # IGTree classifies documents rather than raw strings, so keep one
        # ham (type=True) and one spam (type=False) document aside.
        test_doc1 = None
        test_doc2 = None
        for document in self.model:
            if isinstance(v, vector.IGTree):
                if test_doc1 is None and document.type is True:
                    test_doc1 = document
                if test_doc2 is None and document.type is False:
                    test_doc2 = document
            v.train(document)
        for type, message in (
          (False, "win money"),
          (True, "fix bug")):
            if not isinstance(v, vector.IGTree):
                self.assertEqual(v.classify(message), type)
        if isinstance(v, vector.IGTree):
            self.assertEqual(v.classify(test_doc1), True)
            self.assertEqual(v.classify(test_doc2), False)
        # Assert classifier properties.
        self.assertEqual(v.binary, True)
        self.assertEqual(sorted(v.classes), [False, True])
        self.assertTrue(isinstance(v.features, list))
        self.assertTrue("ftp" in v.features)
        # Assert saving + loading.
        v.save(Classifier.__name__)
        try:
            v = Classifier.load(Classifier.__name__)
            if not isinstance(v, vector.IGTree):
                self.assertEqual(v.classify("win money"), False)
                self.assertEqual(v.classify("fix bug"), True)
        finally:
            # Remove the serialized classifier even if an assertion fails.
            os.remove(Classifier.__name__)
        # Assert untrained classifier returns None.
        v = Classifier(**kwargs)
        self.assertEqual(v.classify("herring"), None)
        print("pattern.vector.%s.train()" % Classifier.__name__)
        print("pattern.vector.%s.classify()" % Classifier.__name__)
        print("pattern.vector.%s.save()" % Classifier.__name__)

    def test_classifier_vector(self):
        # Assert Classifier._vector() (translates input from train() and classify() to a Vector).
        v = vector.Classifier()._vector
        self.assertEqual(("cat", {"cat": 0.5, "purs": 0.5}), v(vector.Document("the cat purs", type="cat")))
        self.assertEqual(("cat", {"cat": 0.5, "purs": 0.5}), v({"cat": 0.5, "purs": 0.5}, type="cat"))
        self.assertEqual(("cat", {"cat": 0.5, "purs": 0.5}), v(["cat", "purs"], type="cat"))
        self.assertEqual(("cat", {"cat": 0.5, "purs": 0.5}), v("cat purs", type="cat"))
        print("pattern.vector.Classifier._vector()")

    def test_nb(self):
        # Assert Bayesian probability classification.
        self._test_classifier(vector.NB)
        # Assert the accuracy of the classifier.
        A, P, R, F, o = vector.NB.test(self.model, folds=10, method=vector.BERNOUILLI)
        #print(A, P, R, F, o)
        self.assertTrue(P >= 0.88)
        self.assertTrue(R >= 0.89)
        self.assertTrue(F >= 0.88)

    def test_igtree(self):
        # Assert information gain tree classification.
        self._test_classifier(vector.IGTree, method=vector.GAINRATIO)
        # Assert the accuracy of the classifier.
        # Fixed: was vector.IGTREE (AttributeError); the class is IGTree.
        A, P, R, F, o = vector.IGTree.test(self.model, folds=10, method=vector.GAINRATIO)
        #print(A, P, R, F, o)
        self.assertTrue(P >= 0.87)
        self.assertTrue(R >= 0.88)
        self.assertTrue(F >= 0.89)

    def test_knn(self):
        # Assert nearest-neighbor classification.
        self._test_classifier(vector.KNN, k=10, distance=vector.COSINE)
        # Assert the accuracy of the classifier.
        A, P, R, F, o = vector.KNN.test(self.model, folds=10, k=2, distance=vector.COSINE)
        #print(A, P, R, F, o)
        self.assertTrue(P >= 0.91)
        self.assertTrue(R >= 0.92)
        self.assertTrue(F >= 0.92)

    def test_slp(self):
        # Seed for a reproducible perceptron weight initialization.
        random.seed(1)
        # Assert single-layer averaged perceptron classification.
        self._test_classifier(vector.SLP)
        # Assert the accuracy of the classifier.
        A, P, R, F, o = vector.SLP.test(self.model, folds=10, iterations=3)
        #print(A, P, R, F, o)
        self.assertTrue(P >= 0.90)
        self.assertTrue(R >= 0.91)
        self.assertTrue(F >= 0.91)

    def test_svm(self):
        # Skip gracefully when the LIBSVM/LIBLINEAR extension is unavailable.
        try:
            from pattern.vector import svm
        except ImportError as e:
            print(e)
            return
        # Assert support vector classification.
        self._test_classifier(vector.SVM, type=vector.SVC, kernel=vector.LINEAR)
        # Assert the accuracy of the classifier.
        A, P, R, F, o = vector.SVM.test(self.model, folds=10, type=vector.SVC, kernel=vector.LINEAR)
        #print(A, P, R, F, o)
        self.assertTrue(P >= 0.93)
        self.assertTrue(R >= 0.93)
        self.assertTrue(F >= 0.93)

    def test_liblinear(self):
        # If LIBLINEAR can be loaded,
        # assert that it is used for linear SVC (= 10x faster).
        try:
            from pattern.vector import svm
        except ImportError as e:
            print(e)
            return
        if svm.LIBLINEAR:
            classifier1 = vector.SVM(
                      type = vector.CLASSIFICATION,
                    kernel = vector.LINEAR,
                extensions = (vector.LIBSVM, vector.LIBLINEAR))
            classifier2 = vector.SVM(
                      type = vector.CLASSIFICATION,
                    kernel = vector.RBF,
                extensions = (vector.LIBSVM, vector.LIBLINEAR))
            classifier3 = vector.SVM(
                      type = vector.CLASSIFICATION,
                    kernel = vector.LINEAR,
                extensions = (vector.LIBSVM,))
            # LIBLINEAR only handles the linear kernel; RBF and a LIBSVM-only
            # extension list both fall back to LIBSVM.
            self.assertEqual(classifier1.extension, vector.LIBLINEAR)
            self.assertEqual(classifier2.extension, vector.LIBSVM)
            self.assertEqual(classifier3.extension, vector.LIBSVM)
            print("pattern.vector.svm.LIBSVM")
            print("pattern.vector.svm.LIBLINEAR")
|
|
||
|
#---------------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
def suite():
    """Return a TestSuite with all the test cases in this module."""
    loader = unittest.TestLoader()  # One loader instead of one per case.
    suite = unittest.TestSuite()
    for case in (
            TestUnicode,
            TestUtilityFunctions,
            TestStemmer,
            TestDocument,
            TestModel,
            TestApriori,
            TestLSA,
            TestClustering,
            TestClassifier):
        suite.addTest(loader.loadTestsFromTestCase(case))
    return suite
|
||
|
if __name__ == "__main__":
    # Run the full suite; exit status 0 only if every test passed.
    runner = unittest.TextTestRunner(verbosity=1)
    result = runner.run(suite())
    sys.exit(not result.wasSuccessful())