# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import unittest
import random
import subprocess
from pattern import text
from pattern import en
from io import open
try:
PATH = os.path.dirname(os.path.realpath(__file__))
except NameError:  # __file__ is undefined, e.g., in the interactive interpreter.
PATH = ""
#---------------------------------------------------------------------------------------------------
class TestInflection(unittest.TestCase):
def setUp(self):
pass
def test_indefinite_article(self):
# Assert "a" or "an".
for article, word in (
("an", "hour"),
("an", "FBI"),
("a", "bear"),
("a", "one-liner"),
("a", "European"),
("a", "university"),
("a", "uterus"),
("an", "owl"),
("an", "yclept"),
("a", "year")):
self.assertEqual(en.article(word, function=en.INDEFINITE), article)
self.assertEqual(en.inflect.article("heir", function=en.DEFINITE), "the")
self.assertEqual(en.inflect.referenced("ewe"), "a ewe")
print("pattern.en.inflect.article()")
def test_pluralize(self):
# Assert "octopodes" for classical plural of "octopus".
# Assert "octopuses" for modern plural.
self.assertEqual("octopodes", en.inflect.pluralize("octopus", classical=True))
self.assertEqual("octopuses", en.inflect.pluralize("octopus", classical=False))
# Assert the accuracy of the pluralization algorithm.
from pattern.db import Datasheet
i, n = 0, 0
for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
if en.inflect.pluralize(sg) == pl:
i += 1
n += 1
self.assertTrue(float(i) / n > 0.95)
print("pattern.en.inflect.pluralize()")
def test_singularize(self):
# Assert the accuracy of the singularization algorithm.
from pattern.db import Datasheet
i, n = 0, 0
for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
if en.inflect.singularize(pl) == sg:
i += 1
n += 1
self.assertTrue(float(i) / n > 0.95)
print("pattern.en.inflect.singularize()")
def test_find_lemma(self):
# Assert the accuracy of the verb lemmatization algorithm.
        # Note: the accuracy is higher (95%) when measured on CELEX word forms
        # (probably because en.verbs has a high percentage of irregular verbs).
i, n = 0, 0
for v1, v2 in en.inflect.verbs.inflections.items():
if en.inflect.verbs.find_lemma(v1) == v2:
i += 1
n += 1
self.assertTrue(float(i) / n > 0.90)
print("pattern.en.inflect.verbs.find_lemma()")
def test_find_lexeme(self):
# Assert the accuracy of the verb conjugation algorithm.
i, n = 0, 0
for v, lexeme1 in en.inflect.verbs.infinitives.items():
lexeme2 = en.inflect.verbs.find_lexeme(v)
            for j in range(len(lexeme2)):
                # An empty cell in the stored lexeme means the form defaults
                # to another slot: index 0 (the infinitive) or, for the
                # past-tense half (j > 5), index 10 (the simple past).
                if lexeme1[j] == lexeme2[j] or \
                   (lexeme1[j] == "" and lexeme1[10 if j > 5 else 0] == lexeme2[j]):
                    i += 1
                n += 1
self.assertTrue(float(i) / n > 0.90)
print("pattern.en.inflect.verbs.find_lexeme()")
def test_conjugate(self):
# Assert different tenses with different conjugations.
for (v1, v2, tense) in (
("be", "be", en.INFINITIVE),
("be", "am", (en.PRESENT, 1, en.SINGULAR)),
("be", "are", (en.PRESENT, 2, en.SINGULAR)),
("be", "is", (en.PRESENT, 3, en.SINGULAR)),
("be", "are", (en.PRESENT, 0, en.PLURAL)),
("be", "being", (en.PRESENT + en.PARTICIPLE,)),
("be", "was", (en.PAST, 1, en.SINGULAR)),
("be", "were", (en.PAST, 2, en.SINGULAR)),
("be", "was", (en.PAST, 3, en.SINGULAR)),
("be", "were", (en.PAST, 0, en.PLURAL)),
("be", "were", (en.PAST, 0, None)),
("be", "been", (en.PAST + en.PARTICIPLE,)),
("be", "am", "1sg"),
("be", "are", "2sg"),
("be", "is", "3sg"),
("be", "are", "1pl"),
("be", "are", "2pl"),
("be", "are", "3pl"),
("be", "are", "pl"),
("be", "being", "part"),
("be", "was", "1sgp"),
("be", "were", "2sgp"),
("be", "was", "3sgp"),
("be", "were", "1ppl"),
("be", "were", "2ppl"),
("be", "were", "3ppl"),
("be", "were", "p"),
("be", "were", "ppl"),
("be", "been", "ppart"),
("be", "am not", "1sg-"),
("be", "aren't", "2sg-"),
("be", "isn't", "3sg-"),
("be", "aren't", "1pl-"),
("be", "aren't", "2pl-"),
("be", "aren't", "3pl-"),
("be", "aren't", "pl-"),
("be", "wasn't", "1sgp-"),
("be", "weren't", "2sgp-"),
("be", "wasn't", "3sgp-"),
("be", "weren't", "1ppl-"),
("be", "weren't", "2ppl-"),
("be", "weren't", "3ppl-"),
("be", "weren't", "ppl-"),
("had", "have", "inf"),
("had", "have", "1sg"),
("had", "have", "2sg"),
("had", "has", "3sg"),
("had", "have", "pl"),
("had", "having", "part"),
("has", "had", "1sgp"),
("has", "had", "2sgp"),
("has", "had", "3sgp"),
("has", "had", "ppl"),
("has", "had", "p"),
("has", "had", "ppart"),
("will", "will", "1sg"),
("will", "will", "2sg"),
("will", "will", "3sg"),
("will", "will", "1pl"),
("imaginerify", "imaginerifying", "part"),
("imaginerify", "imaginerified", "3sgp"),
("imaginerify", None, "1sg-")):
self.assertEqual(en.inflect.conjugate(v1, tense), v2)
print("pattern.en.inflect.conjugate()")
def test_lemma(self):
# Assert the infinitive of "weren't".
v = en.inflect.lemma("weren't")
self.assertEqual(v, "be")
print("pattern.en.inflect.lemma()")
def test_lexeme(self):
# Assert all inflections of "be".
v = en.inflect.lexeme("be")
self.assertEqual(v, [
"be", "am", "are", "is", "being",
"was", "were", "been",
"am not", "aren't", "isn't", "wasn't", "weren't"
])
v = en.inflect.lexeme("imaginerify")
self.assertEqual(v, [
"imaginerify", "imaginerifies", "imaginerifying", "imaginerified"
])
print("pattern.en.inflect.lexeme()")
def test_tenses(self):
# Assert tense recognition.
self.assertTrue((en.inflect.PRESENT, 1, en.inflect.SINGULAR) in en.inflect.tenses("am"))
self.assertTrue("1sg" in en.inflect.tenses("am"))
self.assertTrue("1sg" in en.inflect.tenses("will"))
self.assertTrue("2sg-" in en.inflect.tenses("won't"))
self.assertTrue("part" in en.inflect.tenses("imaginarifying"))
print("pattern.en.inflect.tenses()")
def test_comparative(self):
# Assert "nice" => "nicer".
self.assertEqual(en.inflect.comparative("nice"), "nicer")
print("pattern.en.inflect.comparative()")
def test_superlative(self):
# Assert "nice" => "nicest"
self.assertEqual(en.inflect.superlative("nice"), "nicest")
# Assert "important" => "most important"
self.assertEqual(en.inflect.superlative("important"), "most important")
print("pattern.en.inflect.superlative()")
#---------------------------------------------------------------------------------------------------
class TestQuantification(unittest.TestCase):
def setUp(self):
pass
def test_extract_leading_zeros(self):
# Assert "zero zero one" => ("one", 2).
from pattern.text.en.inflect_quantify import zshift
v = zshift("zero zero one")
self.assertEqual(v, ("one", 2))
v = zshift("0 0 one")
self.assertEqual(v, ("one", 2))
print("pattern.en.quantify._extract_leading_zeros()")
def test_numerals(self):
# Assert number to numerals.
for x, s in (
( 1.5, "one point five"),
( 15, "fifteen"),
( 150, "one hundred and fifty"),
( 151, "one hundred and fifty-one"),
( 1510, "one thousand five hundred and ten"),
( 15101, "fifteen thousand one hundred and one"),
( 150101, "one hundred and fifty thousand one hundred and one"),
(1500101, "one million, five hundred thousand one hundred and one")):
self.assertEqual(en.numerals(x), s)
print("pattern.en.numerals()")
def test_number(self):
# Assert numeric string = actual number (after rounding).
for i in range(100):
x = random.random()
y = en.number(en.numerals(x, round=10))
self.assertAlmostEqual(x, y, places=10)
print("pattern.en.number()")
def test_quantify(self):
# Assert quantification algorithm.
for a, s in (
( 2 * ["carrot"], "a pair of carrots"),
( 4 * ["carrot"], "several carrots"),
( 9 * ["carrot"], "a number of carrots"),
( 19 * ["carrot"], "a score of carrots"),
( 23 * ["carrot"], "dozens of carrots"),
( 201 * ["carrot"], "hundreds of carrots"),
(1001 * ["carrot"], "thousands of carrots"),
({"carrot": 4, "parrot": 2}, "several carrots and a pair of parrots")):
self.assertEqual(en.quantify(a), s)
print("pattern.en.quantify()")
def test_reflect(self):
self.assertEqual(en.reflect(""), "a string")
self.assertEqual(en.reflect(["", "", ""]), "several strings")
self.assertEqual(en.reflect(en.reflect), "a function")
print("pattern.en.reflect()")
#---------------------------------------------------------------------------------------------------
class TestSpelling(unittest.TestCase):
def test_spelling(self):
# Assert case-sensitivity + numbers.
for a, b in (
( ".", "." ),
( "?", "?" ),
( "!", "!" ),
( "I", "I" ),
( "a", "a" ),
( "42", "42" ),
("3.14", "3.14"),
( "The", "The" ),
( "the", "the" )):
self.assertEqual(en.suggest(a)[0][0], b)
# Assert spelling suggestion accuracy.
# Note: simply training on more text will not improve accuracy.
i = j = 0.0
from pattern.db import Datasheet
for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
for w in wrong.split(" "):
if en.suggest(w)[0][0] == correct:
i += 1
else:
j += 1
self.assertTrue(i / (i + j) > 0.70)
print("pattern.en.suggest()")
#---------------------------------------------------------------------------------------------------
class TestParser(unittest.TestCase):
def setUp(self):
pass
def test_tokenize(self):
# Assert list with two sentences.
# The tokenizer should at least handle common abbreviations and punctuation.
v = en.tokenize("The cat is eating (e.g., a fish). Yum!")
self.assertEqual(v, ["The cat is eating ( e.g. , a fish ) .", "Yum !"])
print("pattern.en.tokenize()")
def _test_morphological_rules(self, function=en.parser.morphology.apply):
""" For each word in WordNet that is not in Brill's lexicon,
test if the given tagger((word, "NN")) yields an improved (word, tag).
Returns the relative scores for nouns, verbs, adjectives and adverbs.
"""
scores = []
for tag, lexicon in (
("NN", en.wordnet.NOUNS),
("VB", en.wordnet.VERBS),
("JJ", en.wordnet.ADJECTIVES),
("RB", en.wordnet.ADVERBS)):
i, n = 0, 0
for word in lexicon():
word = word.replace("_", " ")
if word not in en.lexicon:
if function([word, "NN"])[1].startswith(tag):
i += 1
n += 1
scores.append(float(i) / n)
return scores
def test_default_suffix_rules(self):
# Assert part-of-speech tag for unknown tokens.
for a, b in (
(["eating", "NN"], ["eating", "VBG"]),
(["tigers", "NN"], ["tigers", "NNS"]),
(["really", "NN"], ["really", "RB"]),
(["foolish", "NN"], ["foolish", "JJ"])):
self.assertEqual(text._suffix_rules(a), b)
# Test with words in WordNet that are not in Brill's lexicon.
# Given are the scores for detection of nouns, verbs, adjectives and adverbs.
# The baseline should increase (not decrease) when the algorithm is modified.
v = self._test_morphological_rules(function=text._suffix_rules)
self.assertTrue(v[0] > 0.91) # NN
self.assertTrue(v[1] > 0.23) # VB
self.assertTrue(v[2] > 0.38) # JJ
self.assertTrue(v[3] > 0.60) # RB
print("pattern.text._suffix_rules()")
def test_apply_morphological_rules(self):
# Assert part-of-speech tag for unknown tokens (Brill's lexical rules).
v = self._test_morphological_rules(function=en.parser.morphology.apply)
self.assertTrue(v[0] > 0.85) # NN
self.assertTrue(v[1] > 0.19) # VB
self.assertTrue(v[2] > 0.65) # JJ
self.assertTrue(v[3] > 0.59) # RB
print("pattern.en.parser.morphology.apply()")
def test_apply_context_rules(self):
# Assert part-of-speech tags based on word context.
for a, b in ( # Rule:
([["", "JJ"], ["", "JJ"], ["", ","]], [["", "JJ"], ["", "NN"], ["", ","]]), # SURROUNDTAG
([["", "NNP"], ["", "RB"]], [["", "NNP"], ["", "NNP"]]), # PREVTAG
([["", "NN"], ["", "PRP$"]], [["", "VB"], ["", "PRP$"]]), # NEXTTAG
([["phone", ""], ["", "VBZ"]], [["phone", ""], ["", "NNS"]]), # PREVWD
([["", "VB"], ["countries", ""]], [["", "JJ"], ["countries", ""]]), # NEXTWD
([["close", "VB"], ["to", ""]], [["close", "RB"], ["to", ""]]), # RBIGRAM
([["very", ""], ["much", "JJ"]], [["very", ""], ["much", "RB"]]), # LBIGRAM
([["such", "JJ"], ["as", "DT"]], [["such", "JJ"], ["as", "IN"]]), # WDNEXTWD
([["be", "VB"]], [["be", "VB"]])): # CURWD
self.assertEqual(en.parser.context.apply(a), b)
print("pattern.en.parser.context.apply()")
def test_find_tags(self):
# Assert part-of-speech-tag annotation.
v = en.parser.find_tags(["black", "cat"])
self.assertEqual(v, [["black", "JJ"], ["cat", "NN"]])
self.assertEqual(en.parser.find_tags(["felix"])[0][1], "NN")
self.assertEqual(en.parser.find_tags(["Felix"])[0][1], "NNP")
print("pattern.en.parser.find_tags()")
def test_find_chunks(self):
# Assert chunk tag annotation.
v = en.parser.find_chunks([["black", "JJ"], ["cat", "NN"]])
self.assertEqual(v, [["black", "JJ", "B-NP", "O"], ["cat", "NN", "I-NP", "O"]])
# Assert the accuracy of the chunker.
# For example, in "The very black cat must be really meowing really loud in the yard.":
# - "The very black" (NP)
# - "must be really meowing" (VP)
# - "really loud" (ADJP)
# - "in" (PP)
# - "the yard" (NP)
v = en.parser.find_chunks([
["", "DT"], ["", "RB"], ["", "JJ"], ["", "NN"],
["", "MD"], ["", "RB"], ["", "VBZ"], ["", "VBG"],
["", "RB"], ["", "JJ"],
["", "IN"],
["", "CD"], ["", "NNS"]
])
self.assertEqual(v, [
["", "DT", "B-NP", "O"], ["", "RB", "I-NP", "O"], ["", "JJ", "I-NP", "O"], ["", "NN", "I-NP", "O"],
["", "MD", "B-VP", "O"], ["", "RB", "I-VP", "O"], ["", "VBZ", "I-VP", "O"], ["", "VBG", "I-VP", "O"],
["", "RB", "B-ADJP", "O"], ["", "JJ", "I-ADJP", "O"],
["", "IN", "B-PP", "B-PNP"],
["", "CD", "B-NP", "I-PNP"], ["", "NNS", "I-NP", "I-PNP"]])
# Assert commas inside chunks.
# - "the big, black cat"
v = en.parser.find_chunks([
["", "DT"], ["", "JJ"], ["", ","], ["", "JJ"], ["", "NN"]
])
self.assertEqual(v, [
["", "DT", "B-NP", "O"],
["", "JJ", "I-NP", "O"],
["", ",", "I-NP", "O"],
["", "JJ", "I-NP", "O"],
["", "NN", "I-NP", "O"]
])
# - "big, black and furry"
v = en.parser.find_chunks([
["", "JJ"], ["", ","], ["", "JJ"], ["", "CC"], ["", "JJ"]
])
self.assertEqual(v, [
["", "JJ", "B-ADJP", "O"],
["", ",", "I-ADJP", "O"],
["", "JJ", "I-ADJP", "O"],
["", "CC", "I-ADJP", "O"],
["", "JJ", "I-ADJP", "O"]
])
# - big, and very black (= two chunks "big" and "very black")
v = en.parser.find_chunks([
["", "JJ"], ["", ","], ["", "CC"], ["", "RB"], ["", "JJ"]
])
self.assertEqual(v, [
["", "JJ", "B-ADJP", "O"],
["", ",", "O", "O"],
["", "CC", "O", "O"],
["", "RB", "B-ADJP", "O"],
["", "JJ", "I-ADJP", "O"]
])
# Assert cases for which we have written special rules.
# - "perhaps you" (ADVP + NP)
v = en.parser.find_chunks([["", "RB"], ["", "PRP"]])
self.assertEqual(v, [["", "RB", "B-ADVP", "O"], ["", "PRP", "B-NP", "O"]])
# - "very nice cats" (NP)
v = en.parser.find_chunks([["", "RB"], ["", "JJ"], ["", "PRP"]])
self.assertEqual(v, [["", "RB", "B-NP", "O"], ["", "JJ", "I-NP", "O"], ["", "PRP", "I-NP", "O"]])
print("pattern.en.parser.find_chunks()")
def test_find_labels(self):
# Assert relation tag annotation (SBJ/OBJ).
v = en.parser.find_labels([
["", "", "NP"], ["", "", "NP"],
["", "", "VP"], ["", "", "VP"],
["", "", "NP"]])
self.assertEqual(v, [
["", "", "NP", "NP-SBJ-1"], ["", "", "NP", "NP-SBJ-1"],
["", "", "VP", "VP-1"], ["", "", "VP", "VP-1"],
["", "", "NP", "NP-OBJ-1"]])
print("pattern.en.parser.find_labels()")
def test_find_prepositions(self):
# Assert preposition tag annotation (PP + NP).
v = en.parser.find_prepositions([
["", "", "NP"],
["", "", "VP"],
["", "", "PP"],
["", "", "NP"],
["", "", "NP"], ])
self.assertEqual(v, [
["", "", "NP", "O"],
["", "", "VP", "O"],
["", "", "PP", "B-PNP"],
["", "", "NP", "I-PNP"],
["", "", "NP", "I-PNP"]])
# Assert PNP's with consecutive PP's.
v = en.parse("The cat was looking at me from up on the roof with interest.", prepositions=True)
self.assertEqual(v,
"The/DT/B-NP/O cat/NN/I-NP/O " \
"was/VBD/B-VP/O looking/VBG/I-VP/O " \
"at/IN/B-PP/B-PNP me/PRP/B-NP/I-PNP " \
"from/IN/B-PP/B-PNP up/IN/I-PP/I-PNP on/IN/I-PP/I-PNP the/DT/B-NP/I-PNP roof/NN/I-NP/I-PNP " \
"with/IN/B-PP/B-PNP interest/NN/B-NP/I-PNP " \
"././O/O"
)
print("pattern.en.parser.find_prepositions()")
def test_find_lemmata(self):
# Assert lemmata for nouns and verbs.
v = en.parser.find_lemmata([["cats", "NNS"], ["wearing", "VBG"], ["hats", "NNS"]])
self.assertEqual(v, [
["cats", "NNS", "cat"],
["wearing", "VBG", "wear"],
["hats", "NNS", "hat"]])
print("pattern.en.parser.find_lemmata()")
def test_named_entity_recognition(self):
# Assert named entities.
v = en.parser.parse("Arnold Schwarzenegger is cool.", chunks=False)
self.assertEqual(v,
"Arnold/NNP-PERS Schwarzenegger/NNP-PERS is/VBZ cool/JJ ./."
)
print("pattern.en.parser.entities.apply()")
def test_parse(self):
# Assert parsed output with Penn Treebank II tags (slash-formatted).
# 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase.
v = en.parser.parse("The black cat sat on the mat.")
self.assertEqual(v,
"The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \
"sat/VBD/B-VP/O " + \
"on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
)
# 2) "the black cat" is the subject, "a fish" is the object.
v = en.parser.parse("The black cat is eating a fish.", relations=True)
self.assertEqual(v,
"The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \
"is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \
"a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O"
)
# 3) "chasing" and "mice" lemmata are "chase" and "mouse".
v = en.parser.parse("The black cat is chasing mice.", lemmata=True)
self.assertEqual(v,
"The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \
"is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \
"mice/NNS/B-NP/O/mouse ././O/O/."
)
# 4) Assert str.
self.assertTrue(isinstance(v, str))
# 5) Assert str for faulty input (bytestring with unicode characters).
self.assertTrue(isinstance(en.parse("ø ü"), str))
self.assertTrue(isinstance(en.parse("ø ü", tokenize=True, tags=False, chunks=False), str))
self.assertTrue(isinstance(en.parse("ø ü", tokenize=False, tags=False, chunks=False), str))
self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), str))
# 6) Assert optional parameters (i.e., setting all to False).
self.assertEqual(en.parse("ø ü.", tokenize=True, tags=False, chunks=False), "ø ü .")
self.assertEqual(en.parse("ø ü.", tokenize=False, tags=False, chunks=False), "ø ü.")
# 7) Assert the accuracy of the English tagger.
i, n = 0, 0
for corpus, a in (("tagged-en-wsj.txt", (0.968, 0.945)), ("tagged-en-oanc.txt", (0.929, 0.932))):
            for sentence in open(os.path.join(PATH, "corpora", corpus), encoding="utf-8").readlines():
sentence = sentence.strip()
s1 = [w.split("/") for w in sentence.split(" ")]
s2 = [[w for w, pos in s1]]
s2 = en.parse(s2, tokenize=False)
s2 = [w.split("/") for w in s2.split(" ")]
for j in range(len(s1)):
if s1[j][1] == s2[j][1].split("-")[0]:
i += 1
n += 1
#print(corpus, float(i) / n)
            self.assertTrue(float(i) / n > (a[0] if en.parser.model else a[1]))
print("pattern.en.parse()")
def test_tagged_string(self):
# Assert splitable TaggedString with language and tags properties.
v = en.parser.parse("The black cat sat on the mat.", relations=True, lemmata=True)
self.assertEqual(v.language, "en")
self.assertEqual(v.tags,
["word", "part-of-speech", "chunk", "preposition", "relation", "lemma"])
self.assertEqual(v.split(text.TOKENS)[0][0],
["The", "DT", "B-NP", "O", "NP-SBJ-1", "the"])
print("pattern.en.parse().split()")
def test_parsetree(self):
# Assert parsetree(s) == Text.
v = en.parsetree("The cat purs.")
self.assertTrue(isinstance(v, en.Text))
print("pattern.en.parsetree()")
def test_split(self):
# Assert split(parse(s)) == Text.
v = en.split(en.parse("The cat purs."))
self.assertTrue(isinstance(v, en.Text))
print("pattern.en.split()")
def test_tag(self):
# Assert [("black", "JJ"), ("cats", "NNS")].
v = en.tag("black cats")
self.assertEqual(v, [("black", "JJ"), ("cats", "NNS")])
v = en.tag("")
self.assertEqual(v, [])
print("pattern.en.tag()")
def test_ngrams(self):
# Assert n-grams with and without punctuation marks / sentence marks.
s = "The cat is napping."
v1 = en.ngrams(s, n=2)
v2 = en.ngrams(s, n=3, punctuation=en.PUNCTUATION.strip("."))
self.assertEqual(v1, [("The", "cat"), ("cat", "is"), ("is", "napping")])
self.assertEqual(v2, [("The", "cat", "is"), ("cat", "is", "napping"), ("is", "napping", ".")])
s = "The cat purrs. The dog barks."
v1 = en.ngrams(s, n=2)
v2 = en.ngrams(s, n=2, continuous=True)
self.assertEqual(v1, [("The", "cat"), ("cat", "purrs"), ("The", "dog"), ("dog", "barks")])
self.assertEqual(v2, [("The", "cat"), ("cat", "purrs"), ("purrs", "The"), ("The", "dog"), ("dog", "barks")])
print("pattern.en.ngrams()")
def test_command_line(self):
# Assert parsed output from the command-line (example from the documentation).
p = ["python", "-m", "pattern.en", "-s", "Nice cat.", "-OTCRL"]
p = subprocess.Popen(p, stdout=subprocess.PIPE)
p.wait()
v = p.stdout.read().decode('utf-8')
v = v.strip()
self.assertEqual(v, "Nice/JJ/B-NP/O/O/nice cat/NN/I-NP/O/O/cat ././O/O/O/.")
print("python -m pattern.en")
#---------------------------------------------------------------------------------------------------
class TestParseTree(unittest.TestCase):
def setUp(self):
# Parse sentences to test on.
# Creating a Text creates Sentence, Chunk, PNP and Word.
# Creating a Sentence tests Sentence.append() and Sentence.parse_token().
self.text = "I'm eating pizza with a fork. What a tasty pizza!"
self.text = en.Text(en.parse(self.text, relations=True, lemmata=True))
def test_copy(self):
# Assert deepcopy of Text, Sentence, Chunk, PNP and Word.
self.text = self.text.copy()
print("pattern.en.Text.copy()")
def test_xml(self):
# Assert XML export and import.
self.text = en.Text.from_xml(self.text.xml)
print("pattern.en.Text.xml")
print("pattern.en.Text.from_xml()")
def test_text(self):
# Assert Text.
self.assertEqual(self.text.sentences[0].string, "I 'm eating pizza with a fork .")
self.assertEqual(self.text.sentences[1].string, "What a tasty pizza !")
print("pattern.en.Text")
def test_sentence(self):
# Assert Sentence.
v = self.text[0]
self.assertTrue(v.start == 0)
self.assertTrue(v.stop == 8)
self.assertTrue(v.string == "I 'm eating pizza with a fork .")
self.assertTrue(v.subjects == [self.text[0].chunks[0]])
self.assertTrue(v.verbs == [self.text[0].chunks[1]])
self.assertTrue(v.objects == [self.text[0].chunks[2]])
self.assertTrue(v.nouns == [self.text[0].words[3], self.text[0].words[6]])
# Sentence.string must be unicode.
self.assertTrue(isinstance(v.string, str))
self.assertTrue(isinstance(str(v), str))
print("pattern.en.Sentence")
def test_sentence_constituents(self):
# Assert in-order list of Chunk, PNP and Word.
v = self.text[0].constituents(pnp=True)
self.assertEqual(v, [
self.text[0].chunks[0],
self.text[0].chunks[1],
self.text[0].chunks[2],
self.text[0].pnp[0],
self.text[0].words[7],
])
print("pattern.en.Sentence.constituents()")
def test_slice(self):
# Assert sentence slice.
v = self.text[0].slice(start=4, stop=6)
self.assertTrue(v.parent == self.text[0])
self.assertTrue(v.string == "with a")
# Assert sentence slice tag integrity.
self.assertTrue(v.words[0].type == "IN")
self.assertTrue(v.words[1].chunk is None)
print("pattern.en.Slice")
def test_chunk(self):
# Assert chunk with multiple words ("a fork").
v = self.text[0].chunks[4]
self.assertTrue(v.start == 5)
self.assertTrue(v.stop == 7)
self.assertTrue(v.string == "a fork")
self.assertTrue(v.lemmata == ["a", "fork"])
self.assertTrue(v.words == [self.text[0].words[5], self.text[0].words[6]])
self.assertTrue(v.head == self.text[0].words[6])
self.assertTrue(v.type == "NP")
self.assertTrue(v.role is None)
self.assertTrue(v.pnp is not None)
# Assert chunk that is subject/object of the sentence ("pizza").
v = self.text[0].chunks[2]
self.assertTrue(v.role == "OBJ")
self.assertTrue(v.relation == 1)
self.assertTrue(v.related == [self.text[0].chunks[0], self.text[0].chunks[1]])
self.assertTrue(v.subject == self.text[0].chunks[0])
self.assertTrue(v.verb == self.text[0].chunks[1])
self.assertTrue(v.object is None)
# Assert chunk traversal.
self.assertEqual(v.nearest("VP"), self.text[0].chunks[1])
self.assertEqual(v.previous(), self.text[0].chunks[1])
self.assertEqual(v.next(), self.text[0].chunks[3])
print("pattern.en.Chunk")
def test_chunk_conjunctions(self):
# Assert list of conjunct/disjunct chunks ("black cat" AND "white cat").
v = en.Sentence(en.parse("black cat and white cat"))
self.assertEqual(v.chunk[0].conjunctions, [(v.chunk[1], en.AND)])
print("pattern.en.Chunk.conjunctions()")
def test_chunk_modifiers(self):
# Assert list of nearby adjectives and adverbs with no role, for VP.
v = en.Sentence(en.parse("Perhaps you should go."))
self.assertEqual(v.chunk[2].modifiers, [v.chunk[0]]) # should <=> perhaps
print("pattern.en.Chunk.modifiers")
def test_pnp(self):
# Assert PNP chunk ("with a fork").
v = self.text[0].pnp[0]
self.assertTrue(v.string == "with a fork")
self.assertTrue(v.chunks == [self.text[0].chunks[3], self.text[0].chunks[4]])
self.assertTrue(v.pp == self.text[0].chunks[3])
print("pattern.en.PNP")
def test_word(self):
# Assert word tags ("fork" => NN).
v = self.text[0].words[6]
self.assertTrue(v.index == 6)
self.assertTrue(v.string == "fork")
self.assertTrue(v.lemma == "fork")
self.assertTrue(v.type == "NN")
self.assertTrue(v.chunk == self.text[0].chunks[4])
self.assertTrue(v.pnp is not None)
for i, tags in enumerate([
["I", "PRP", "B-NP", "O", "NP-SBJ-1", "i"],
["'m", "VBP", "B-VP", "O", "VP-1", "be"],
["eating", "VBG", "I-VP", "O", "VP-1", "eat"],
["pizza", "NN", "B-NP", "O", "NP-OBJ-1", "pizza"],
["with", "IN", "B-PP", "B-PNP", "O", "with"],
["a", "DT", "B-NP", "I-PNP", "O", "a"],
["fork", "NN", "I-NP", "I-PNP", "O", "fork"],
[".", ".", "O", "O", "O", "."]]):
self.assertEqual(self.text[0].words[i].tags, tags)
print("pattern.en.Word")
def test_word_custom_tags(self):
# Assert word custom tags ("word/part-of-speech/.../some-custom-tag").
s = en.Sentence("onion/NN/FOOD", token=[en.WORD, en.POS, "semantic_type"])
v = s.words[0]
self.assertEqual(v.semantic_type, "FOOD")
self.assertEqual(v.custom_tags["semantic_type"], "FOOD")
self.assertEqual(v.copy().custom_tags["semantic_type"], "FOOD")
# Assert addition of new custom tags.
v.custom_tags["taste"] = "pungent"
self.assertEqual(s.token, [en.WORD, en.POS, "semantic_type", "taste"])
print("pattern.en.Word.custom_tags")
def test_find(self):
# Assert first item for which given function is True.
v = text.tree.find(lambda x: x > 10, [1, 2, 3, 11, 12])
self.assertEqual(v, 11)
print("pattern.text.tree.find()")
def test_zip(self):
# Assert list of zipped tuples, using default to balance uneven lists.
v = text.tree.zip([1, 2, 3], [4, 5, 6, 7], default=0)
self.assertEqual(v, [(1, 4), (2, 5), (3, 6), (0, 7)])
print("pattern.text.tree.zip()")
def test_unzip(self):
v = text.tree.unzip(1, [(1, 4), (2, 5), (3, 6)])
self.assertEqual(v, [4, 5, 6])
print("pattern.text.tree.unzip()")
def test_unique(self):
# Assert list copy with unique items.
v = text.tree.unique([1, 1, 1])
self.assertEqual(len(v), 1)
self.assertEqual(v[0], 1)
print("pattern.text.tree.unique()")
def test_map(self):
# Assert dynamic Map().
v = text.tree.Map(lambda x: x + 1, [1, 2, 3])
self.assertEqual(list(v), [2, 3, 4])
self.assertEqual(v.items[0], 1)
print("pattern.text.tree.Map()")
#---------------------------------------------------------------------------------------------------
class TestModality(unittest.TestCase):
def setUp(self):
pass
def test_imperative(self):
# Assert True for sentences that are orders, commands, warnings.
from pattern.text.en.modality import imperative
for b, s in (
(True, "Do your homework!"),
(True, "Do not listen to me."),
(True, "Turn that off, will you."),
(True, "Let's help him."),
(True, "Help me!"),
(True, "You will help me."),
(False, "Do it if you think it is necessary."),
(False, "I hope you will help me."),
(False, "I can help you."),
(False, "I can help you if you let me.")):
self.assertEqual(imperative(en.Sentence(en.parse(s))), b)
print("pattern.en.modality.imperative()")
def test_conditional(self):
# Assert True for sentences that contain possible or imaginary situations.
from pattern.text.en.modality import conditional
for b, s in (
(True, "We ought to help him."),
(True, "We could help him."),
(True, "I will help you."),
(True, "I hope you will help me."),
(True, "I can help you if you let me."),
(False, "You will help me."),
(False, "I can help you.")):
self.assertEqual(conditional(en.Sentence(en.parse(s))), b)
# Assert predictive mood.
s = "I will help you."
v = conditional(en.Sentence(en.parse(s)), predictive=False)
self.assertEqual(v, False)
# Assert speculative mood.
s = "I will help you if you pay me."
v = conditional(en.Sentence(en.parse(s)), predictive=False)
self.assertEqual(v, True)
print("pattern.en.modality.conditional()")
def test_subjunctive(self):
# Assert True for sentences that contain wishes, judgments or opinions.
from pattern.text.en.modality import subjunctive
for b, s in (
(True, "I wouldn't do that if I were you."),
(True, "I wish I knew."),
(True, "I propose that you be on time."),
(True, "It is a bad idea to be late."),
(False, "I will be late.")):
self.assertEqual(subjunctive(en.Sentence(en.parse(s))), b)
print("pattern.en.modality.subjunctive()")
def test_negated(self):
# Assert True for sentences that contain "not", "n't" or "never".
for b, s in (
(True, "Not true?"),
(True, "Never true."),
(True, "Isn't true."),):
self.assertEqual(en.negated(en.Sentence(en.parse(s))), b)
print("pattern.en.negated()")
def test_mood(self):
# Assert imperative mood.
v = en.mood(en.Sentence(en.parse("Do your homework!")))
self.assertEqual(v, en.IMPERATIVE)
# Assert conditional mood.
v = en.mood(en.Sentence(en.parse("We ought to help him.")))
self.assertEqual(v, en.CONDITIONAL)
# Assert subjunctive mood.
v = en.mood(en.Sentence(en.parse("I wouldn't do that if I were you.")))
self.assertEqual(v, en.SUBJUNCTIVE)
# Assert indicative mood.
v = en.mood(en.Sentence(en.parse("The weather is nice today.")))
self.assertEqual(v, en.INDICATIVE)
print("pattern.en.mood()")
def test_modality(self):
        # Assert a score between -1.0 and +1.0 representing the degree of certainty.
v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
self.assertTrue(v < 0)
v = en.modality(en.Sentence(en.parse("It will surely stop raining soon.")))
self.assertTrue(v > 0)
# Assert the accuracy of the modality algorithm.
# Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
# http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
# The baseline should increase (not decrease) when the algorithm is modified.
from pattern.db import Datasheet
from pattern.metrics import test
sentences = []
for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")):
sentence = en.parse(sentence, chunks=False, light=True)
sentence = en.Sentence(sentence)
sentences.append((sentence, int(certain) > 0))
A, P, R, F = test(lambda sentence: en.modality(sentence) > 0.5, sentences)
#print(A, P, R, F)
self.assertTrue(A > 0.69)
self.assertTrue(P > 0.72)
self.assertTrue(R > 0.63)
self.assertTrue(F > 0.68)
print("pattern.en.modality()")
#---------------------------------------------------------------------------------------------------
class TestSentiment(unittest.TestCase):
def setUp(self):
pass
def test_sentiment_avg(self):
# Assert 2.5.
from pattern.text import avg
v = avg([1, 2, 3, 4])
self.assertEqual(v, 2.5)
print("pattern.text.avg")
def test_sentiment(self):
# Assert < 0 for negative adjectives and > 0 for positive adjectives.
self.assertTrue(en.sentiment("wonderful")[0] > 0)
self.assertTrue(en.sentiment("horrible")[0] < 0)
self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
# Assert that :) and :( are recognized.
self.assertTrue(en.sentiment(":)")[0] > 0)
self.assertTrue(en.sentiment(":(")[0] < 0)
# Assert the accuracy of the sentiment analysis (for the positive class).
# Given are the scores for Pang & Lee's polarity dataset v2.0:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
# The baseline should increase (not decrease) when the algorithm is modified.
from pattern.db import Datasheet
from pattern.metrics import test
reviews = []
for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
reviews.append((review, int(score) > 0))
A, P, R, F = test(lambda review: en.positive(review), reviews)
#print(A, P, R, F)
self.assertTrue(A > 0.752)
self.assertTrue(P > 0.772)
self.assertTrue(R > 0.715)
self.assertTrue(F > 0.743)
# Assert the accuracy of the sentiment analysis on short text (for the positive class).
# Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
reviews = []
for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
reviews.append((review, int(score) > 0))
A, P, R, F = test(lambda review: en.positive(review), reviews)
#print(A, P, R, F)
self.assertTrue(A > 0.654)
self.assertTrue(P > 0.660)
self.assertTrue(R > 0.636)
self.assertTrue(F > 0.648)
print("pattern.en.sentiment()")
def test_sentiment_twitter(self):
sanders = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
if os.path.exists(sanders):
# Assert the accuracy of the sentiment analysis on tweets.
# Given are the scores for Sanders Twitter Sentiment Corpus:
# http://www.sananalytics.com/lab/twitter-sentiment/
            # Positive + neutral is taken as polarity >= 0.0,
            # negative as polarity < 0.0.
            # Since there are a lot of neutral cases, and the algorithm
            # predicts 0.0 by default (i.e., the majority class),
            # the results are good.
            # Distinguishing negative from neutral from positive is a much harder task.
from pattern.db import Datasheet
from pattern.metrics import test
reviews = []
for i, id, date, tweet, polarity, topic in Datasheet.load(sanders):
if polarity != "irrelevant":
reviews.append((tweet, polarity in ("positive", "neutral")))
A, P, R, F = test(lambda review: en.positive(review, threshold=0.0), reviews)
#print(A, P, R, F)
self.assertTrue(A > 0.824)
self.assertTrue(P > 0.879)
self.assertTrue(R > 0.911)
self.assertTrue(F > 0.895)
def test_sentiment_assessment(self):
# Assert that en.sentiment() has a fine-grained "assessments" property.
v = en.sentiment("A warm and pleasant day.").assessments
self.assertTrue(v[1][0][0] == "pleasant")
self.assertTrue(v[1][1] > 0)
print("pattern.en.sentiment().assessments")
def test_polarity(self):
# Assert that en.polarity() yields en.sentiment()[0].
s = "A great day!"
self.assertTrue(en.polarity(s) == en.sentiment(s)[0])
print("pattern.en.polarity()")
def test_subjectivity(self):
# Assert that en.subjectivity() yields en.sentiment()[1].
s = "A great day!"
self.assertTrue(en.subjectivity(s) == en.sentiment(s)[1])
print("pattern.en.subjectivity()")
def test_positive(self):
# Assert that en.positive() yields polarity >= 0.1.
s = "A great day!"
self.assertTrue(en.positive(s))
print("pattern.en.subjectivity()")
def test_sentiwordnet(self):
# Assert < 0 for negative words and > 0 for positive words.
try:
from pattern.text.en.wordnet import SentiWordNet
lexicon = SentiWordNet()
lexicon.load()
except ImportError as e:
            # The SentiWordNet data file is not installed in the default location; skip this test.
print(e)
return
self.assertTrue(lexicon["wonderful"][0] > 0)
self.assertTrue(lexicon["horrible"][0] < 0)
print("pattern.en.sentiment.SentiWordNet")
#---------------------------------------------------------------------------------------------------
class TestWordNet(unittest.TestCase):
def setUp(self):
pass
def test_normalize(self):
# Assert normalization of simple diacritics (WordNet does not store diacritics).
self.assertEqual(en.wordnet.normalize("cliché"), "cliche")
self.assertEqual(en.wordnet.normalize("façade"), "facade")
print("pattern.en.wordnet.normalize()")
def test_version(self):
print("WordNet " + en.wordnet.VERSION)
def test_synsets(self):
# Assert synsets by part-of-speech.
for word, pos in (
("cat", en.wordnet.NOUN),
("purr", en.wordnet.VERB),
("nice", en.wordnet.ADJECTIVE),
("nicely", en.wordnet.ADVERB),
("cat", "nn"),
("cat", "NNS")):
self.assertTrue(en.wordnet.synsets(word, pos) != [])
# Assert TypeError when part-of-speech is not NOUN, VERB, ADJECTIVE or ADVERB.
self.assertRaises(TypeError, en.wordnet.synsets, "cat", "unknown_pos")
print("pattern.en.wordnet.synsets()")
def test_synset(self):
v = en.wordnet.synsets("puma")[0]
# Assert Synset(id).
self.assertEqual(v, en.wordnet.Synset(v.id))
self.assertEqual(v.pos, en.wordnet.NOUN)
self.assertAlmostEqual(v.ic, 0.0, places=1)
self.assertTrue("cougar" in v.synonyms) # ["cougar", "puma", "catamount", ...]
self.assertTrue("feline" in v.gloss) # "large American feline resembling a lion"
# Assert WordNet relations.
s = en.wordnet.synsets
v = s("tree")[0]
self.assertTrue(v.hypernym in v.hypernyms())
self.assertTrue(s("woody plant")[0] in v.hypernyms())
self.assertTrue(s("entity")[0] in v.hypernyms(recursive=True))
self.assertTrue(s("beech")[0] in v.hyponyms())
self.assertTrue(s("red beech")[0] in v.hyponyms(recursive=True))
self.assertTrue(s("trunk")[0] in v.meronyms())
self.assertTrue(s("forest")[0] in v.holonyms())
# Assert Lin-similarity.
self.assertTrue(
v.similarity(s("flower")[0]) >
v.similarity(s("teapot")[0]))
print("pattern.en.wordnet.Synset")
    def test_ancestor(self):
# Assert least-common-subsumer algorithm.
v1 = en.wordnet.synsets("cat")[0]
v2 = en.wordnet.synsets("dog")[0]
self.assertTrue(en.wordnet.ancestor(v1, v2) == en.wordnet.synsets("carnivore")[0])
print("pattern.en.wordnet.ancestor()")
def test_map32(self):
# Assert sense mapping from WN 3.0 to 2.1.
self.assertEqual(en.wordnet.map32(18850, "JJ"), (19556, "JJ"))
self.assertEqual(en.wordnet.map32(1382437, "VB"), (1370230, "VB"))
print("pattern.en.wordnet.map32")
def test_sentiwordnet(self):
# Assert SentiWordNet is loaded correctly.
if en.wordnet.sentiwordnet is None:
return
try:
en.wordnet.sentiwordnet.load()
except ImportError:
return
v = en.wordnet.synsets("anguish")[0]
self.assertEqual(v.weight, (-0.625, 0.625))
v = en.wordnet.synsets("enzymology")[0]
self.assertEqual(v.weight, (0.125, 0.125))
print("pattern.en.wordnet.sentiwordnet")
#---------------------------------------------------------------------------------------------------
class TestWordlists(unittest.TestCase):
def setUp(self):
pass
def test_wordlist(self):
# Assert lazy loading Wordlist.
v = en.wordlist.STOPWORDS
self.assertTrue("the" in v)
# Assert Wordlist to dict.
v = dict.fromkeys(en.wordlist.STOPWORDS, True)
self.assertTrue("the" in v)
# Assert new Wordlist by adding other Wordlists.
v = en.wordlist.STOPWORDS + en.wordlist.ACADEMIC
self.assertTrue("the" in v)
self.assertTrue("dr." in v)
print("pattern.en.wordlist.Wordlist")
#---------------------------------------------------------------------------------------------------
def suite():
suite = unittest.TestSuite()
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestQuantification))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestSpelling))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParseTree))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestModality))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestSentiment))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestWordNet))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestWordlists))
return suite
if __name__ == "__main__":
result = unittest.TextTestRunner(verbosity=1).run(suite())
sys.exit(not result.wasSuccessful())