# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import unittest
import subprocess

from pattern import de

from io import open

try:
    PATH = os.path.dirname(os.path.realpath(__file__))
except:
    PATH = ""

#---------------------------------------------------------------------------------------------------
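
# The accuracy checks below (test_pluralize, test_singularize, test_predicative)
# all follow the same pattern: iterate over the CELEX word-form CSV, apply one
# pattern.de function to a source column, and count exact matches against a
# target column. As an illustrative sketch only (the helper name celex_accuracy
# is ours, not part of pattern; the tests below keep their original inline
# loops), that pattern could be factored out like this:


def celex_accuracy(transform, src, dst, tag):
    """Return the fraction of rows tagged `tag` in wordforms-de-celex.csv
    for which transform(row[src]) == row[dst]."""
    from pattern.db import Datasheet
    i, n = 0, 0
    for row in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if row[0] == tag:
            if transform(row[src]) == row[dst]:
                i += 1
            n += 1
    return float(i) / n if n > 0 else 0.0

# For example, celex_accuracy(de.pluralize, 1, 2, "n") mirrors the loop in
# test_pluralize() below, and celex_accuracy(de.predicative, 2, 1, "a")
# mirrors test_predicative().
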

class TestInflection(unittest.TestCase):

    def setUp(self):
        pass

    def test_gender(self):
        # Assert der Hund => MASCULINE
        # Assert die Studentin => FEMININE
        # Assert das Auto => NEUTRAL
        self.assertEqual(de.gender("Hund"), de.MASCULINE)
        self.assertEqual(de.gender("Studentin"), de.FEMININE)
        self.assertEqual(de.gender("Auto"), de.NEUTRAL)

    def test_pluralize(self):
        # Assert the accuracy of the pluralization algorithm.
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "n":
                if de.pluralize(sg) == pl:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.69)
        print("pattern.de.pluralize()")

    def test_singularize(self):
        # Assert the accuracy of the singularization algorithm.
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "n":
                if de.singularize(pl) == sg:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.82)
        print("pattern.de.singularize()")

    def test_attributive(self):
        # Assert "groß" => "großer" (masculine, nominative), and others.
        for lemma, inflected, gender, role, article in (
                ("groß", "großer", de.MALE, de.SUBJECT, None),
                ("groß", "großen", de.MALE, de.OBJECT, None),
                ("groß", "großem", de.MALE, de.INDIRECT, None),
                ("groß", "großen", de.MALE, de.PROPERTY, None),
                ("groß", "große", de.FEMALE, de.SUBJECT, None),
                ("groß", "große", de.FEMALE, de.OBJECT, None),
                ("groß", "großer", de.FEMALE, de.INDIRECT, None),
                ("groß", "großes", de.NEUTRAL, de.SUBJECT, None),
                ("groß", "großes", de.NEUTRAL, de.OBJECT, None),
                ("groß", "großen", de.MALE, de.PROPERTY, "mein"),
                ("groß", "großen", de.FEMALE, de.PROPERTY, "jeder"),
                ("groß", "großen", de.FEMALE, de.PROPERTY, "mein"),
                ("groß", "großen", de.PLURAL, de.INDIRECT, "jede"),
                ("groß", "großen", de.PLURAL, de.PROPERTY, "jeder")):
            v = de.attributive(lemma, gender, role, article)
            self.assertEqual(v, inflected)
        print("pattern.de.attributive()")

    def test_predicative(self):
        # Assert the accuracy of the predicative algorithm ("großer" => "groß").
        from pattern.db import Datasheet
        i, n = 0, 0
        for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
            if tag == "a":
                if de.predicative(attr) == pred:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.98)
        print("pattern.de.predicative()")

    def test_find_lemma(self):
        # Assert the accuracy of the verb lemmatization algorithm.
        # Note: the accuracy is higher (88%) when measured on CELEX word forms
        # (presumably because de.inflect.verbs contains a high percentage of irregular verbs).
        i, n = 0, 0
        for v1, v2 in de.inflect.verbs.inflections.items():
            if de.inflect.verbs.find_lemma(v1) == v2:
                i += 1
            n += 1
        self.assertTrue(float(i) / n > 0.86)
        print("pattern.de.inflect.verbs.find_lemma()")

    def test_find_lexeme(self):
        # Assert the accuracy of the verb conjugation algorithm.
        i, n = 0, 0
        for v, lexeme1 in de.inflect.verbs.infinitives.items():
            lexeme2 = de.inflect.verbs.find_lexeme(v)
            for j in range(len(lexeme2)):
                if lexeme1[j] == "":
                    # Skip forms that are not listed in the lexicon.
                    continue
                if lexeme1[j] == lexeme2[j]:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.86)
        print("pattern.de.inflect.verbs.find_lexeme()")

    def test_conjugate(self):
        # Assert different tenses with different conjugations.
        for (v1, v2, tense) in (
                ("sein", "sein", de.INFINITIVE),
                ("sein", "bin", (de.PRESENT, 1, de.SINGULAR)),
                ("sein", "bist", (de.PRESENT, 2, de.SINGULAR)),
                ("sein", "ist", (de.PRESENT, 3, de.SINGULAR)),
                ("sein", "sind", (de.PRESENT, 1, de.PLURAL)),
                ("sein", "seid", (de.PRESENT, 2, de.PLURAL)),
                ("sein", "sind", (de.PRESENT, 3, de.PLURAL)),
                ("sein", "seiend", (de.PRESENT + de.PARTICIPLE)),
                ("sein", "war", (de.PAST, 1, de.SINGULAR)),
                ("sein", "warst", (de.PAST, 2, de.SINGULAR)),
                ("sein", "war", (de.PAST, 3, de.SINGULAR)),
                ("sein", "waren", (de.PAST, 1, de.PLURAL)),
                ("sein", "wart", (de.PAST, 2, de.PLURAL)),
                ("sein", "waren", (de.PAST, 3, de.PLURAL)),
                ("sein", "gewesen", (de.PAST + de.PARTICIPLE)),
                ("sein", "sei", (de.PRESENT, 2, de.SINGULAR, de.IMPERATIVE)),
                ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.IMPERATIVE)),
                ("sein", "seid", (de.PRESENT, 2, de.PLURAL, de.IMPERATIVE)),
                ("sein", "sei", (de.PRESENT, 1, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "seiest", (de.PRESENT, 2, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "sei", (de.PRESENT, 3, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.SUBJUNCTIVE)),
                ("sein", "seiet", (de.PRESENT, 2, de.PLURAL, de.SUBJUNCTIVE)),
                ("sein", "seien", (de.PRESENT, 3, de.PLURAL, de.SUBJUNCTIVE)),
                ("sein", "wäre", (de.PAST, 1, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "wärest", (de.PAST, 2, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "wäre", (de.PAST, 3, de.SINGULAR, de.SUBJUNCTIVE)),
                ("sein", "wären", (de.PAST, 1, de.PLURAL, de.SUBJUNCTIVE)),
                ("sein", "wäret", (de.PAST, 2, de.PLURAL, de.SUBJUNCTIVE)),
                ("sein", "wären", (de.PAST, 3, de.PLURAL, de.SUBJUNCTIVE))):
            self.assertEqual(de.conjugate(v1, tense), v2)
        print("pattern.de.conjugate()")

    def test_lexeme(self):
        # Assert all inflections of "sein".
        v = de.lexeme("sein")
        self.assertEqual(v, [
            "sein", "bin", "bist", "ist", "sind", "seid", "seiend",
            "war", "warst", "waren", "wart", "gewesen",
            "sei", "seien", "seiest", "seiet",
            "wäre", "wärest", "wären", "wäret"
        ])
        print("pattern.de.inflect.lexeme()")

    def test_tenses(self):
        # Assert tense recognition.
        self.assertTrue((de.PRESENT, 3, de.SG) in de.tenses("ist"))
        self.assertTrue("2sg" in de.tenses("bist"))
        print("pattern.de.tenses()")

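    # As the assertions above show, de.tenses() lists the matching tense
    # descriptions both as tuples, e.g. (de.PRESENT, 3, de.SG), and as short
    # string aliases such as "2sg"; either form can be used for membership
    # tests on a conjugated word form.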

#---------------------------------------------------------------------------------------------------


class TestParser(unittest.TestCase):

    def setUp(self):
        pass

    def test_find_lemmata(self):
        # Assert lemmata for nouns, adjectives and verbs.
        v = de.parser.find_lemmata([["Ich", "PRP"], ["sage", "VB"], ["schöne", "JJ"], ["Dinge", "NNS"]])
        self.assertEqual(v, [
            ["Ich", "PRP", "ich"],
            ["sage", "VB", "sagen"],
            ["schöne", "JJ", "schön"],
            ["Dinge", "NNS", "ding"]])
        print("pattern.de.parser.find_lemmata()")

    def test_parse(self):
        # Assert parsed output with Penn Treebank II tags (slash-formatted).
        # 1) "Der große Hund" is a noun phrase, "auf der Matte" is a prepositional noun phrase.
        v = de.parser.parse("Der große Hund sitzt auf der Matte.")
        self.assertEqual(v,
            "Der/DT/B-NP/O große/JJ/I-NP/O Hund/NN/I-NP/O " + \
            "sitzt/VB/B-VP/O " + \
            "auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O"
        )
        # 2) The lemmata of "große" and "sitzt" are "groß" and "sitzen".
        # Note how articles are problematic ("der" can be a masculine subject but also a plural possessive).
        v = de.parser.parse("Der große Hund sitzt auf der Matte.", lemmata=True)
        self.assertEqual(v,
            "Der/DT/B-NP/O/der große/JJ/I-NP/O/groß Hund/NN/I-NP/O/hund " + \
            "sitzt/VB/B-VP/O/sitzen " + \
            "auf/IN/B-PP/B-PNP/auf der/DT/B-NP/I-PNP/der Matte/NN/I-NP/I-PNP/matte ././O/O/."
        )
        # 3) Assert the accuracy of the German tagger.
        i, n = 0, 0
        for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt")).readlines():
            sentence = sentence.strip()
            s1 = [w.split("/") for w in sentence.split(" ")]
            s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
            s2 = [[w for w, pos in s1]]
            s2 = de.parse(s2, tokenize=False)
            s2 = [w.split("/") for w in s2.split(" ")]
            for j in range(len(s1)):
                if s1[j][1] == s2[j][1]:
                    i += 1
                n += 1
        self.assertTrue(float(i) / n > 0.844)
        print("pattern.de.parse()")

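    # The slash-formatted output asserted above is WORD/POS/CHUNK/PNP, with a
    # trailing /LEMMA column added when parse() is called with lemmata=True.
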
    def test_tag(self):
        # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")].
        v = de.tag("der grosse Hund")
        self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")])
        print("pattern.de.tag()")

    def test_command_line(self):
        # Assert parsed output from the command line (example from the documentation).
        p = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
        p = subprocess.Popen(p, stdout=subprocess.PIPE)
        p.wait()
        v = p.stdout.read().decode("utf-8")
        v = v.strip()
        self.assertEqual(v, "Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
        print("python -m pattern.de")

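    # The subprocess call above is equivalent to running, from a shell:
    #
    #     python -m pattern.de -s "Der grosse Hund." -OTCRL
    #
    # which prints the same slash-formatted parse to stdout.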

#---------------------------------------------------------------------------------------------------


def suite():
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
    return suite


if __name__ == "__main__":
    result = unittest.TextTestRunner(verbosity=1).run(suite())
    sys.exit(not result.wasSuccessful())
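
# Running this module directly (e.g. `python test_de.py`, assuming the file is
# saved under that name in pattern's test folder) executes both test cases and
# exits with a non-zero status if any test fails.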