# -*- coding: utf-8 -*-
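"""Unit tests for pattern.de: German inflection (gender, pluralization,
singularization, adjective declension, verb conjugation) and parsing
(part-of-speech tagging, chunking, lemmatization)."""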
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
import sys
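# Make the pattern package in the parent directory importable when the tests
# are run from a source checkout (assumes this file lives in a subdirectory, e.g. test/).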
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import unittest
import subprocess
from pattern import de
from io import open
try:
    PATH = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # __file__ is undefined when running in an interactive interpreter.
    PATH = ""
#---------------------------------------------------------------------------------------------------
class TestInflection(unittest.TestCase):

    def setUp(self):
        pass

    def test_gender(self):
        # Assert der Hund => MASCULINE
        # Assert die Studentin => FEMININE
        # Assert das Auto => NEUTRAL
        self.assertEqual(de.gender("Hund"), de.MASCULINE)
        self.assertEqual(de.gender("Studentin"), de.FEMININE)
        self.assertEqual(de.gender("Auto"), de.NEUTRAL)

    def test_pluralize(self):
        # Assert the accuracy of the pluralization algorithm.
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "n":
                if de.pluralize(sg) == pl:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.69)
        print("pattern.de.pluralize()")

    def test_singularize(self):
        # Assert the accuracy of the singularization algorithm.
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "n":
                if de.singularize(pl) == sg:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.82)
        print("pattern.de.singularize()")

    def test_attributive(self):
        # Assert "groß" => "großer" (masculine, nominative), and others.
        for lemma, inflected, gender, role, article in (
          ("groß", "großer", de.MALE, de.SUBJECT, None),
          ("groß", "großen", de.MALE, de.OBJECT, None),
          ("groß", "großem", de.MALE, de.INDIRECT, None),
          ("groß", "großen", de.MALE, de.PROPERTY, None),
          ("groß", "große", de.FEMALE, de.SUBJECT, None),
          ("groß", "große", de.FEMALE, de.OBJECT, None),
          ("groß", "großer", de.FEMALE, de.INDIRECT, None),
          ("groß", "großes", de.NEUTRAL, de.SUBJECT, None),
          ("groß", "großes", de.NEUTRAL, de.OBJECT, None),
          ("groß", "großen", de.MALE, de.PROPERTY, "mein"),
          ("groß", "großen", de.FEMALE, de.PROPERTY, "jeder"),
          ("groß", "großen", de.FEMALE, de.PROPERTY, "mein"),
          ("groß", "großen", de.PLURAL, de.INDIRECT, "jede"),
          ("groß", "großen", de.PLURAL, de.PROPERTY, "jeder")):
            v = de.attributive(lemma, gender, role, article)
            self.assertEqual(v, inflected)
        print("pattern.de.attributive()")

    def test_predicative(self):
        # Assert the accuracy of the predicative algorithm ("großer" => "groß").
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "a":
                if de.predicative(attr) == pred:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.98)
        print("pattern.de.predicative()")

    def test_find_lemma(self):
        # Assert the accuracy of the verb lemmatization algorithm.
        # Note: the accuracy is higher (88%) when measured on CELEX word forms
        # (presumably because de.inflect.verbs contains a high percentage of irregular verbs).
        i, n = 0, 0
        for v1, v2 in de.inflect.verbs.inflections.items():
            if de.inflect.verbs.find_lemma(v1) == v2:
                i += 1
            n += 1
        self.assertTrue(float(i) / n > 0.86)
        print("pattern.de.inflect.verbs.find_lemma()")

    def test_find_lexeme(self):
        # Assert the accuracy of the verb conjugation algorithm.
        i, n = 0, 0
        for v, lexeme1 in de.inflect.verbs.infinitives.items():
            lexeme2 = de.inflect.verbs.find_lexeme(v)
            for j in range(len(lexeme2)):
                if lexeme1[j] == "":
                    continue
                if lexeme1[j] == lexeme2[j]:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.86)
        print("pattern.de.inflect.verbs.find_lexeme()")

    def test_conjugate(self):
        # Assert different tenses with different conjugations.
        for (v1, v2, tense) in (
          ("sein", "sein", de.INFINITIVE),
          ("sein", "bin", (de.PRESENT, 1, de.SINGULAR)),
          ("sein", "bist", (de.PRESENT, 2, de.SINGULAR)),
          ("sein", "ist", (de.PRESENT, 3, de.SINGULAR)),
          ("sein", "sind", (de.PRESENT, 1, de.PLURAL)),
          ("sein", "seid", (de.PRESENT, 2, de.PLURAL)),
          ("sein", "sind", (de.PRESENT, 3, de.PLURAL)),
          ("sein", "seiend", (de.PRESENT + de.PARTICIPLE)),
          ("sein", "war", (de.PAST, 1, de.SINGULAR)),
          ("sein", "warst", (de.PAST, 2, de.SINGULAR)),
          ("sein", "war", (de.PAST, 3, de.SINGULAR)),
          ("sein", "waren", (de.PAST, 1, de.PLURAL)),
          ("sein", "wart", (de.PAST, 2, de.PLURAL)),
          ("sein", "waren", (de.PAST, 3, de.PLURAL)),
          ("sein", "gewesen", (de.PAST + de.PARTICIPLE)),
          ("sein", "sei", (de.PRESENT, 2, de.SINGULAR, de.IMPERATIVE)),
          ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.IMPERATIVE)),
          ("sein", "seid", (de.PRESENT, 2, de.PLURAL, de.IMPERATIVE)),
          ("sein", "sei", (de.PRESENT, 1, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "seiest", (de.PRESENT, 2, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "sei", (de.PRESENT, 3, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.SUBJUNCTIVE)),
          ("sein", "seiet", (de.PRESENT, 2, de.PLURAL, de.SUBJUNCTIVE)),
          ("sein", "seien", (de.PRESENT, 3, de.PLURAL, de.SUBJUNCTIVE)),
          ("sein", "wäre", (de.PAST, 1, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "wärest", (de.PAST, 2, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "wäre", (de.PAST, 3, de.SINGULAR, de.SUBJUNCTIVE)),
          ("sein", "wären", (de.PAST, 1, de.PLURAL, de.SUBJUNCTIVE)),
          ("sein", "wäret", (de.PAST, 2, de.PLURAL, de.SUBJUNCTIVE)),
          ("sein", "wären", (de.PAST, 3, de.PLURAL, de.SUBJUNCTIVE))):
            self.assertEqual(de.conjugate(v1, tense), v2)
        print("pattern.de.conjugate()")

    def test_lexeme(self):
        # Assert all inflections of "sein".
        v = de.lexeme("sein")
        self.assertEqual(v, [
            "sein", "bin", "bist", "ist", "sind", "seid", "seiend",
            "war", "warst", "waren", "wart", "gewesen",
            "sei", "seien", "seiest", "seiet",
            "wäre", "wärest", "wären", "wäret"
        ])
        print("pattern.de.inflect.lexeme()")

    def test_tenses(self):
        # Assert tense recognition.
        self.assertTrue((de.PRESENT, 3, de.SG) in de.tenses("ist"))
        self.assertTrue("2sg" in de.tenses("bist"))
        print("pattern.de.tenses()")
#---------------------------------------------------------------------------------------------------
class TestParser(unittest.TestCase):

    def setUp(self):
        pass

    def test_find_lemmata(self):
        # Assert lemmata for nouns, adjectives and verbs.
        v = de.parser.find_lemmata([["Ich", "PRP"], ["sage", "VB"], ["schöne", "JJ"], ["Dinge", "NNS"]])
        self.assertEqual(v, [
            ["Ich", "PRP", "ich"],
            ["sage", "VB", "sagen"],
            ["schöne", "JJ", "schön"],
            ["Dinge", "NNS", "ding"]])
        print("pattern.de.parser.find_lemmata()")

    def test_parse(self):
        # Assert parsed output with Penn Treebank II tags (slash-formatted).
        # 1) "der große Hund" is a noun phrase, "auf der Matte" is a prepositional noun phrase.
        v = de.parser.parse("Der große Hund sitzt auf der Matte.")
        self.assertEqual(v,
            "Der/DT/B-NP/O große/JJ/I-NP/O Hund/NN/I-NP/O " + \
            "sitzt/VB/B-VP/O " + \
            "auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O"
        )
        # 2) The lemmata of "große" and "sitzt" are "groß" and "sitzen".
        # Note how articles are problematic ("der" can be a masculine subject but also a plural possessive).
        v = de.parser.parse("Der große Hund sitzt auf der Matte.", lemmata=True)
        self.assertEqual(v,
            "Der/DT/B-NP/O/der große/JJ/I-NP/O/groß Hund/NN/I-NP/O/hund " + \
            "sitzt/VB/B-VP/O/sitzen " + \
            "auf/IN/B-PP/B-PNP/auf der/DT/B-NP/I-PNP/der Matte/NN/I-NP/I-PNP/matte ././O/O/."
        )
        # 3) Assert the accuracy of the German tagger.
        i, n = 0, 0
        for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt"), encoding="utf-8").readlines():
            sentence = sentence.strip()
            s1 = [w.split("/") for w in sentence.split(" ")]
            s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
            s2 = [[w for w, pos in s1]]
            s2 = de.parse(s2, tokenize=False)
            s2 = [w.split("/") for w in s2.split(" ")]
            for j in range(len(s1)):
                if s1[j][1] == s2[j][1]:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.844)
        print("pattern.de.parse()")

    def test_tag(self):
        # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")].
        v = de.tag("der grosse Hund")
        self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")])
        print("pattern.de.tag()")

    def test_command_line(self):
        # Assert parsed output from the command-line (example from the documentation).
        # sys.executable ensures the subprocess uses the same interpreter as the test run.
        p = [sys.executable, "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
        p = subprocess.Popen(p, stdout=subprocess.PIPE)
        p.wait()
        v = p.stdout.read().decode("utf-8")
        v = v.strip()
        self.assertEqual(v, "Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
        print("python -m pattern.de")
#---------------------------------------------------------------------------------------------------

def suite():
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
    return suite

if __name__ == "__main__":
    result = unittest.TextTestRunner(verbosity=1).run(suite())
    sys.exit(not result.wasSuccessful())