# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import unittest
try:
# Python 2
from StringIO import StringIO
except ImportError:
# Python 3
from io import StringIO
from pattern import text
#---------------------------------------------------------------------------------------------------
class TestLexicon(unittest.TestCase):
    """Tests for text.lazydict, text.lazylist and text.Lexicon."""

    def setUp(self):
        pass

    def test_lazydict(self):
        # Assert lazy dictionary only has data after one of its methods is called.
        class V(text.lazydict):
            def load(self):
                dict.__setitem__(self, "a", 1)
        v = V()
        # Raw dict accessors bypass load(), so the dict still looks empty.
        self.assertTrue(dict.__len__(v) == 0)
        self.assertTrue(dict.__contains__(v, "a") is False)
        # Public len() triggers load().
        # Bug fix: was assertTrue(len(v), 1) — the 1 is treated as the failure
        # message, so the assertion could never fail; assertEqual checks it.
        self.assertEqual(len(v), 1)
        self.assertTrue(v["a"] == 1)
        print("pattern.text.lazydict")

    def test_lazylist(self):
        # Assert lazy list only has data after one of its methods is called.
        class V(text.lazylist):
            def load(self):
                list.append(self, "a")
        v = V()
        # Raw list accessors bypass load(), so the list still looks empty.
        self.assertTrue(list.__len__(v) == 0)
        self.assertTrue(list.__contains__(v, "a") is False)
        # Bug fix: was assertTrue(len(v), 1), which always passes (see above).
        self.assertEqual(len(v), 1)
        self.assertTrue(v[0] == "a")
        print("pattern.text.lazylist")

    def test_lexicon(self):
        # Assert lexicon from file (or file-like string).
        f1 = ";;; Comments. \n schrödinger NNP \n cat NN"
        f2 = StringIO(";;; Comments. \n schrödinger NNP \n cat NN")
        v1 = text.Lexicon(path=f1)
        v2 = text.Lexicon(path=f2)
        self.assertEqual(v1["schrödinger"], "NNP")
        self.assertEqual(v2["schrödinger"], "NNP")
        print("pattern.text.Lexicon")
#---------------------------------------------------------------------------------------------------
class TestFrequency(unittest.TestCase):
    """Tests for the text.Frequency word-frequency table."""

    def setUp(self):
        pass

    def test_frequency(self):
        # Assert word frequency from file (or file-like string).
        data = ";;; Comments. \n the 1.0000 \n of 0.5040"
        from_string = text.Frequency(path=data)
        from_stream = text.Frequency(path=StringIO(data))
        # Both constructions must parse to the same table.
        for table in (from_string, from_stream):
            self.assertEqual(table["of"], 0.504)
        print("pattern.text.Frequency")
#---------------------------------------------------------------------------------------------------
class TestModel(unittest.TestCase):
    """Tests for the text.Model SLP (averaged perceptron) language model."""

    def setUp(self):
        pass

    def test_model(self):
        # Assert SLP language model.
        model = text.Model()
        # Train each example twice so the weights settle.
        for _ in range(2):
            model.train("black", "JJ", previous=("the", "DT"), next=("cat", "NN"))
            model.train("on", "IN", previous=("sat", "VBD"), next=("the", "DT"))
        # Classification generalizes to unseen but similar words/contexts.
        self.assertEqual("JJ", model.classify("slack"))
        self.assertEqual("JJ", model.classify("white", previous=("a", "DT"), next=("cat", "NN")))
        self.assertEqual("IN", model.classify("on", previous=("sat", "VBD")))
        self.assertEqual("IN", model.classify("on", next=("the", "")))
        self.assertEqual(["white", "JJ"], model.apply(("white", ""), next=("cat", "")))
        print("pattern.text.Model")
#---------------------------------------------------------------------------------------------------
class TestMorphology(unittest.TestCase):
    """Tests for Brill-style morphological (lexical) tagging rules."""

    def setUp(self):
        pass

    def test_morphology(self):
        # Assert morphological tagging rules.
        # Rule: words tagged NN ending in "-s" are retagged NNS.
        rules = text.Morphology(StringIO("NN s fhassuf 1 NNS x"))
        retagged = rules.apply(["cats", "NN"])
        self.assertEqual(retagged, ["cats", "NNS"])
        print("pattern.text.Morphology")
#---------------------------------------------------------------------------------------------------
class TestContext(unittest.TestCase):
    """Tests for Brill-style contextual tagging rules."""

    def setUp(self):
        pass

    def test_context(self):
        # Assert contextual tagging rules.
        # Rule: VBD becomes VB when the previous tag is TO.
        rules = text.Context(path=StringIO("VBD VB PREVTAG TO"))
        retagged = rules.apply([["to", "TO"], ["be", "VBD"]])
        self.assertEqual(retagged, [["to", "TO"], ["be", "VB"]])
        print("pattern.text.Context")
#---------------------------------------------------------------------------------------------------
class TestEntities(unittest.TestCase):
    """Tests for the named entity recognizer."""

    def setUp(self):
        pass

    def test_entities(self):
        # Assert named entity recognizer.
        recognizer = text.Entities(path=StringIO("Schrödinger's cat PERS"))
        tagged = recognizer.apply([["Schrödinger's", "NNP"], ["cat", "NN"]])
        # Every token in the matched entity gets the NNP-PERS tag.
        self.assertEqual(tagged, [["Schrödinger's", "NNP-PERS"], ["cat", "NNP-PERS"]])
        print("pattern.text.Entities")
#---------------------------------------------------------------------------------------------------
class TestParser(unittest.TestCase):
    """Tests for the shallow text.Parser: tokenizer, tagger, chunker, keywords."""

    def setUp(self):
        pass

    def test_stringio(self):
        # Assert loading data from file-like strings.
        parser = text.Parser(
            lexicon={"to": "TO", "saw": "VBD"},
            morphology=StringIO("NN s fhassuf 1 NNS x"),
            context=StringIO("VBD VB PREVTAG TO"))
        self.assertEqual(parser.parse("cats"), "cats/NNS/B-NP/O")
        self.assertEqual(parser.parse("to saw"), "to/TO/B-VP/O saw/VB/I-VP/O")

    def test_find_keywords(self):
        # Assert the intrinsic keyword extraction algorithm.
        parser = text.Parser()
        for word, tag in (("the", "DT"), ("cat", "NN"), ("dog", "NN")):
            parser.lexicon[word] = tag
        self.assertEqual(parser.find_keywords("the cat"), ["cat"])
        self.assertEqual(parser.find_keywords("cat. cat. dog."), ["cat", "dog"])
        self.assertEqual(parser.find_keywords("cat. dog. dog."), ["cat", "dog"])
        # A supplied frequency table demotes common words ("cat" here).
        self.assertEqual(
            parser.find_keywords("the. cat. dog.", frequency={"cat": 1.0, "dog": 0.0}),
            ["dog", "cat"])
        print("pattern.text.Parser.find_keywords()")

    def test_find_tokens(self):
        # Assert the default tokenizer and its optional parameters.
        parser = text.Parser()
        self.assertEqual(
            parser.find_tokens("Schrödinger's cat is alive!", punctuation="", replace={})[0],
            "Schrödinger's cat is alive!")
        self.assertEqual(
            parser.find_tokens("Schrödinger's cat is dead!", punctuation="!", replace={"'s": " 's"})[0],
            "Schrödinger 's cat is dead !")
        # Abbreviations control whether a trailing period is split off.
        self.assertEqual(parser.find_tokens("etc.", abbreviations=set())[0], "etc .")
        self.assertEqual(parser.find_tokens("etc.", abbreviations=set(("etc.",)))[0], "etc.")
        print("pattern.text.Parser.find_tokens()")

    def test_find_tags(self):
        # Assert the default part-of-speech tagger and its optional parameters.
        parser = text.Parser()
        v1 = parser.find_tags(["Schrödinger", "cat", "1.0"], lexicon={}, default=("NN?", "NNP?", "CD?"))
        v2 = parser.find_tags(["Schrödinger", "cat", "1.0"], lexicon={"1.0": "CD?"})
        v3 = parser.find_tags(["Schrödinger", "cat", "1.0"], map=lambda token, tag: (token, tag + "!"))
        v4 = parser.find_tags(["observer", "observable"], language="fr")
        v5 = parser.find_tags(["observer", "observable"], language="en")
        self.assertEqual(v1, [["Schr\xf6dinger", "NNP?"], ["cat", "NN?"], ["1.0", "CD?"]])
        self.assertEqual(v2, [["Schr\xf6dinger", "NNP"], ["cat", "NN"], ["1.0", "CD?"]])
        self.assertEqual(v3, [["Schr\xf6dinger", "NNP!"], ["cat", "NN!"], ["1.0", "CD!"]])
        self.assertEqual(v4, [["observer", "NN"], ["observable", "NN"]])
        self.assertEqual(v5, [["observer", "NN"], ["observable", "JJ"]])
        print("pattern.text.Parser.find_tags()")

    def test_find_chunks(self):
        # Assert the default phrase chunker and its optional parameters.
        parser = text.Parser()
        v1 = parser.find_chunks([["", "DT"], ["", "JJ"], ["", "NN"]], language="en")
        v2 = parser.find_chunks([["", "DT"], ["", "JJ"], ["", "NN"]], language="es")
        v3 = parser.find_chunks([["", "DT"], ["", "NN"], ["", "JJ"]], language="en")
        v4 = parser.find_chunks([["", "DT"], ["", "NN"], ["", "JJ"]], language="es")
        # DT-JJ-NN is one noun phrase in both language families.
        self.assertEqual(v1, [["", "DT", "B-NP", "O"], ["", "JJ", "I-NP", "O"], ["", "NN", "I-NP", "O"]])
        self.assertEqual(v2, [["", "DT", "B-NP", "O"], ["", "JJ", "I-NP", "O"], ["", "NN", "I-NP", "O"]])
        # Postnominal adjectives split off in English but attach in Spanish.
        self.assertEqual(v3, [["", "DT", "B-NP", "O"], ["", "NN", "I-NP", "O"], ["", "JJ", "B-ADJP", "O"]])
        self.assertEqual(v4, [["", "DT", "B-NP", "O"], ["", "NN", "I-NP", "O"], ["", "JJ", "I-NP", "O"]])
        print("pattern.text.Parser.find_chunks()")
#---------------------------------------------------------------------------------------------------
class TestSentiment(unittest.TestCase):
    """Tests for the text.Sentiment polarity/subjectivity scorer."""

    def setUp(self):
        pass

    def test_dict(self):
        # Assert weighted average polarity and subjectivity for dictionary.
        scorer = text.Sentiment()
        counts = {":-(": 4, ":-)": 1}
        score = scorer(counts)
        self.assertEqual(score[0], -0.5)
        self.assertEqual(score[1], +1.0)
        self.assertEqual(score.assessments[0], ([":-("], -0.75, 1.0, "mood"))
        self.assertEqual(score.assessments[1], ([":-)"], +0.50, 1.0, "mood"))
        print("pattern.text.Sentiment.assessments")

    def test_bag_of_words(self):
        # Assert weighted average polarity and subjectivity for bag-of-words with weighted features.
        from pattern.vector import BagOfWords # Alias for pattern.vector.Document.
        scorer = text.Sentiment()
        bag = BagOfWords({":-(": 4, ":-)": 1})
        score = scorer(bag)
        self.assertEqual(score[0], -0.5)
        self.assertEqual(score[1], +1.0)
        self.assertEqual(score.assessments[0], ([":-("], -0.75, 1.0, "mood"))
        self.assertEqual(score.assessments[1], ([":-)"], +0.50, 1.0, "mood"))

    def test_annotate(self):
        # Assert custom annotations.
        scorer = text.Sentiment()
        scorer.annotate("inconceivable", polarity=0.9, subjectivity=0.9)
        score = scorer("inconceivable")
        self.assertEqual(score[0], +0.9)
        self.assertEqual(score[1], +0.9)
#---------------------------------------------------------------------------------------------------
class TestMultilingual(unittest.TestCase):
    """Tests for language identification and flooding removal."""

    def setUp(self):
        pass

    def test_language(self):
        # Assert language recognition.
        samples = (
            ("the cat sat on the mat", "en"),
            ("de kat zat op de mat", "nl"),
            ("le chat s'était assis sur le tapis", "fr"),
        )
        for sentence, code in samples:
            self.assertEqual(text.language(sentence)[0], code)
        print("pattern.text.language()")

    def test_deflood(self):
        # Assert flooding removal: repeated characters are capped at n.
        self.assertEqual(text.deflood("NIIICE!!!", n=1), "NICE!")
        self.assertEqual(text.deflood("NIIICE!!!", n=2), "NIICE!!")
        print("pattern.text.deflood()")
#---------------------------------------------------------------------------------------------------
def suite():
    """Return a unittest.TestSuite covering every test case in this module.

    The local variable no longer shadows the function name, a single
    TestLoader is reused, and the case list is data rather than nine
    near-identical statements; order matches the original listing.
    """
    cases = (
        TestLexicon,
        TestFrequency,
        TestModel,
        TestMorphology,
        TestContext,
        TestEntities,
        TestParser,
        TestSentiment,
        TestMultilingual,
    )
    loader = unittest.TestLoader()
    result = unittest.TestSuite()
    for case in cases:
        result.addTest(loader.loadTestsFromTestCase(case))
    return result
if __name__ == "__main__":
    # Run the full suite; exit status is 0 on success, non-zero otherwise.
    outcome = unittest.TextTestRunner(verbosity=1).run(suite())
    sys.exit(not outcome.wasSuccessful())