# -*- coding: utf-8 -*- from __future__ import unicode_literals from __future__ import print_function from __future__ import division from builtins import str, bytes, dict, int from builtins import map, zip, filter from builtins import object, range import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import unittest import subprocess from pattern import de from io import open try: PATH = os.path.dirname(os.path.realpath(__file__)) except: PATH = "" #--------------------------------------------------------------------------------------------------- class TestInflection(unittest.TestCase): def setUp(self): pass def test_gender(self): # Assert der Hund => MASCULINE # Assert die Studentin => FEMININE # Assert das Auto => NEUTRAL self.assertEqual(de.gender("Hund"), de.MASCULINE) self.assertEqual(de.gender("Studentin"), de.FEMININE) self.assertEqual(de.gender("Auto"), de.NEUTRAL) def test_pluralize(self): # Assert the accuracy of the pluralization algorithm. from pattern.db import Datasheet i, n = 0, 0 for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")): if tag == "n": if de.pluralize(sg) == pl: i += 1 n += 1 self.assertTrue(float(i) / n > 0.69) print("pattern.de.pluralize()") def test_singularize(self): # Assert the accuracy of the singularization algorithm. from pattern.db import Datasheet i, n = 0, 0 for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")): if tag == "n": if de.singularize(pl) == sg: i += 1 n += 1 self.assertTrue(float(i) / n > 0.82) print("pattern.de.singularize()") def test_attributive(self): # Assert "groß" => "großer" (masculine, nominative), and others. for lemma, inflected, gender, role, article in ( ("groß", "großer", de.MALE, de.SUBJECT, None), ("groß", "großen", de.MALE, de.OBJECT, None), ("groß", "großem", de.MALE, de.INDIRECT, None), ("groß", "großen", de.MALE, de.PROPERTY, None), ("groß", "große", de.FEMALE, de.SUBJECT, None), ("groß", "große", de.FEMALE, de.OBJECT, None), ("groß", "großer", de.FEMALE, de.INDIRECT, None), ("groß", "großes", de.NEUTRAL, de.SUBJECT, None), ("groß", "großes", de.NEUTRAL, de.OBJECT, None), ("groß", "großen", de.MALE, de.PROPERTY, "mein"), ("groß", "großen", de.FEMALE, de.PROPERTY, "jeder"), ("groß", "großen", de.FEMALE, de.PROPERTY, "mein"), ("groß", "großen", de.PLURAL, de.INDIRECT, "jede"), ("groß", "großen", de.PLURAL, de.PROPERTY, "jeder")): v = de.attributive(lemma, gender, role, article) self.assertEqual(v, inflected) print("pattern.de.attributive()") def test_predicative(self): # Assert the accuracy of the predicative algorithm ("großer" => "groß"). from pattern.db import Datasheet i, n = 0, 0 for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")): if tag == "a": if de.predicative(attr) == pred: i += 1 n += 1 self.assertTrue(float(i) / n > 0.98) print("pattern.de.predicative()") def test_find_lemma(self): # Assert the accuracy of the verb lemmatization algorithm. # Note: the accuracy is higher (88%) when measured on CELEX word forms # (presumably because de.inflect.verbs has high percentage irregular verbs). i, n = 0, 0 for v1, v2 in de.inflect.verbs.inflections.items(): if de.inflect.verbs.find_lemma(v1) == v2: i += 1 n += 1 self.assertTrue(float(i) / n > 0.86) print("pattern.de.inflect.verbs.find_lemma()") def test_find_lexeme(self): # Assert the accuracy of the verb conjugation algorithm. i, n = 0, 0 for v, lexeme1 in de.inflect.verbs.infinitives.items(): lexeme2 = de.inflect.verbs.find_lexeme(v) for j in range(len(lexeme2)): if lexeme1[j] == "": continue if lexeme1[j] == lexeme2[j]: i += 1 n += 1 self.assertTrue(float(i) / n > 0.86) print("pattern.de.inflect.verbs.find_lexeme()") def test_conjugate(self): # Assert different tenses with different conjugations. for (v1, v2, tense) in ( ("sein", "sein", de.INFINITIVE), ("sein", "bin", (de.PRESENT, 1, de.SINGULAR)), ("sein", "bist", (de.PRESENT, 2, de.SINGULAR)), ("sein", "ist", (de.PRESENT, 3, de.SINGULAR)), ("sein", "sind", (de.PRESENT, 1, de.PLURAL)), ("sein", "seid", (de.PRESENT, 2, de.PLURAL)), ("sein", "sind", (de.PRESENT, 3, de.PLURAL)), ("sein", "seiend", (de.PRESENT + de.PARTICIPLE)), ("sein", "war", (de.PAST, 1, de.SINGULAR)), ("sein", "warst", (de.PAST, 2, de.SINGULAR)), ("sein", "war", (de.PAST, 3, de.SINGULAR)), ("sein", "waren", (de.PAST, 1, de.PLURAL)), ("sein", "wart", (de.PAST, 2, de.PLURAL)), ("sein", "waren", (de.PAST, 3, de.PLURAL)), ("sein", "gewesen", (de.PAST + de.PARTICIPLE)), ("sein", "sei", (de.PRESENT, 2, de.SINGULAR, de.IMPERATIVE)), ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.IMPERATIVE)), ("sein", "seid", (de.PRESENT, 2, de.PLURAL, de.IMPERATIVE)), ("sein", "sei", (de.PRESENT, 1, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "seiest", (de.PRESENT, 2, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "sei", (de.PRESENT, 3, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "seien", (de.PRESENT, 1, de.PLURAL, de.SUBJUNCTIVE)), ("sein", "seiet", (de.PRESENT, 2, de.PLURAL, de.SUBJUNCTIVE)), ("sein", "seien", (de.PRESENT, 3, de.PLURAL, de.SUBJUNCTIVE)), ("sein", "wäre", (de.PAST, 1, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "wärest", (de.PAST, 2, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "wäre", (de.PAST, 3, de.SINGULAR, de.SUBJUNCTIVE)), ("sein", "wären", (de.PAST, 1, de.PLURAL, de.SUBJUNCTIVE)), ("sein", "wäret", (de.PAST, 2, de.PLURAL, de.SUBJUNCTIVE)), ("sein", "wären", (de.PAST, 3, de.PLURAL, de.SUBJUNCTIVE))): self.assertEqual(de.conjugate(v1, tense), v2) print("pattern.de.conjugate()") def test_lexeme(self): # Assert all inflections of "sein". v = de.lexeme("sein") self.assertEqual(v, [ "sein", "bin", "bist", "ist", "sind", "seid", "seiend", "war", "warst", "waren", "wart", "gewesen", "sei", "seien", "seiest", "seiet", "wäre", "wärest", "wären", "wäret" ]) print("pattern.de.inflect.lexeme()") def test_tenses(self): # Assert tense recognition. self.assertTrue((de.PRESENT, 3, de.SG) in de.tenses("ist")) self.assertTrue("2sg" in de.tenses("bist")) print("pattern.de.tenses()") #--------------------------------------------------------------------------------------------------- class TestParser(unittest.TestCase): def setUp(self): pass def test_find_lemmata(self): # Assert lemmata for nouns, adjectives and verbs. v = de.parser.find_lemmata([["Ich", "PRP"], ["sage", "VB"], ["schöne", "JJ"], ["Dinge", "NNS"]]) self.assertEqual(v, [ ["Ich", "PRP", "ich"], ["sage", "VB", "sagen"], ["schöne", "JJ", "schön"], ["Dinge", "NNS", "ding"]]) print("pattern.de.parser.find_lemmata()") def test_parse(self): # Assert parsed output with Penn Treebank II tags (slash-formatted). # 1) "der große Hund" is a noun phrase, "auf der Matte" is a prepositional noun phrase. v = de.parser.parse("Der große Hund sitzt auf der Matte.") self.assertEqual(v, "Der/DT/B-NP/O große/JJ/I-NP/O Hund/NN/I-NP/O " + \ "sitzt/VB/B-VP/O " + \ "auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O" ) # 2) "große" and "sitzt" lemmata are "groß" and "sitzen". # Note how articles are problematic ("der" can be male subject but also plural possessive). v = de.parser.parse("Der große Hund sitzt auf der Matte.", lemmata=True) self.assertEqual(v, "Der/DT/B-NP/O/der große/JJ/I-NP/O/groß Hund/NN/I-NP/O/hund " + \ "sitzt/VB/B-VP/O/sitzen " + \ "auf/IN/B-PP/B-PNP/auf der/DT/B-NP/I-PNP/der Matte/NN/I-NP/I-PNP/matte ././O/O/." ) # 3) Assert the accuracy of the German tagger. i, n = 0, 0 for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt")).readlines(): sentence = sentence.strip() s1 = [w.split("/") for w in sentence.split(" ")] s1 = [de.stts2penntreebank(w, pos) for w, pos in s1] s2 = [[w for w, pos in s1]] s2 = de.parse(s2, tokenize=False) s2 = [w.split("/") for w in s2.split(" ")] for j in range(len(s1)): if s1[j][1] == s2[j][1]: i += 1 n += 1 self.assertTrue(float(i) / n > 0.844) print("pattern.de.parse()") def test_tag(self): # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")]. v = de.tag("der grosse Hund") self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")]) print("pattern.de.tag()") def test_command_line(self): # Assert parsed output from the command-line (example from the documentation). p = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"] p = subprocess.Popen(p, stdout=subprocess.PIPE) p.wait() v = p.stdout.read().decode('utf-8') v = v.strip() self.assertEqual(v, "Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.") print("python -m pattern.de") #--------------------------------------------------------------------------------------------------- def suite(): suite = unittest.TestSuite() suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection)) suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser)) return suite if __name__ == "__main__": result = unittest.TextTestRunner(verbosity=1).run(suite()) sys.exit(not result.wasSuccessful())