You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
278 lines
12 KiB
Python
278 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import unicode_literals
|
|
from __future__ import print_function
|
|
from __future__ import division
|
|
|
|
from builtins import str, bytes, dict, int
|
|
from builtins import map, zip, filter
|
|
from builtins import object, range
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
import unittest
|
|
import subprocess
|
|
|
|
from pattern import es
|
|
|
|
from io import open
|
|
|
|
try:
    # Directory containing this test module; the tests below join it with
    # "corpora/..." to locate their fixture files.
    PATH = os.path.dirname(os.path.realpath(__file__))
except Exception:
    # Best effort: if the module location cannot be determined, fall back to
    # paths relative to the current working directory. Catching Exception
    # (rather than a bare except) keeps KeyboardInterrupt/SystemExit intact.
    PATH = ""
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
class TestInflection(unittest.TestCase):
    """Accuracy and regression tests for the Spanish inflection functions of pattern.es."""

    def setUp(self):
        pass

    def _wordforms(self, tag):
        """Return a dict mapping each lemma to the list of its word forms.

        Only rows of corpora/wordforms-es-davies.csv (relative to PATH) whose
        tag column equals the given tag are kept. Each row is unpacked as
        (word form, lemma, tag, fourth column); the fourth column is unused.
        """
        # Local import: only the corpus-based tests need pattern.db.
        from pattern.db import Datasheet
        forms = {}
        path = os.path.join(PATH, "corpora", "wordforms-es-davies.csv")
        for w, lemma, t, _ in Datasheet.load(path):
            if t == tag:
                forms.setdefault(lemma, []).append(w)
        return forms

    def _assert_accuracy(self, correct, total, threshold):
        """Assert that correct / total is strictly greater than threshold."""
        # An empty data set would otherwise surface as an opaque ZeroDivisionError.
        self.assertGreater(total, 0, "no test cases were loaded")
        # float() keeps this a true division even without the __future__ import.
        self.assertGreater(float(correct) / total, threshold)

    def test_pluralize(self):
        # Assert the accuracy of the pluralization algorithm.
        i, n = 0, 0
        for sg, forms in self._wordforms("n").items():
            # The longest word form of a lemma is used as the expected plural
            # (first one wins on ties).
            pl = max(forms, key=len)
            if es.pluralize(sg) == pl:
                i += 1
            n += 1
        self._assert_accuracy(i, n, 0.77)
        print("pattern.es.pluralize()")

    def test_singularize(self):
        # Assert the accuracy of the singularization algorithm.
        i, n = 0, 0
        for sg, forms in self._wordforms("n").items():
            # The longest word form of a lemma is used as the plural input.
            pl = max(forms, key=len)
            if es.singularize(pl) == sg:
                i += 1
            n += 1
        self._assert_accuracy(i, n, 0.93)
        print("pattern.es.singularize()")

    def test_attributive(self):
        # Assert "alto" => "altos" (masculine, plural), and others.
        for lemma, inflected, gender in (
                ("alto", "alto", es.MALE + es.SINGULAR),
                ("alto", "altos", es.MALE + es.PLURAL),
                ("alto", "alta", es.FEMALE + es.SINGULAR),
                ("alto", "altas", es.FEMALE + es.PLURAL),
                ("verde", "verdes", es.MALE + es.PLURAL),
                ("verde", "verdes", es.FEMALE + es.PLURAL)):
            v = es.attributive(lemma, gender)
            self.assertEqual(v, inflected)
        print("pattern.es.attributive()")

    def test_predicative(self):
        # Assert the accuracy of the predicative algorithm ("horribles" => "horrible").
        i, n = 0, 0
        for pred, forms in self._wordforms("j").items():
            # The longest word form of a lemma is used as the attributive input.
            attr = max(forms, key=len)
            if es.predicative(attr) == pred:
                i += 1
            n += 1
        self._assert_accuracy(i, n, 0.92)
        print("pattern.es.predicative()")

    def test_find_lemma(self):
        # Assert the accuracy of the verb lemmatization algorithm.
        i, n = 0, 0
        for v1, v2 in es.inflect.verbs.inflections.items():
            if es.inflect.verbs.find_lemma(v1) == v2:
                i += 1
            n += 1
        self._assert_accuracy(i, n, 0.80)
        print("pattern.es.inflect.verbs.find_lemma()")

    def test_find_lexeme(self):
        # Assert the accuracy of the verb conjugation algorithm.
        i, n = 0, 0
        for v, lexeme1 in es.inflect.verbs.infinitives.items():
            lexeme2 = es.inflect.verbs.find_lexeme(v)
            # Compare position by position; lexeme1 is indexed (not zipped) so
            # a shorter known lexeme still raises instead of being skipped.
            for j, form in enumerate(lexeme2):
                if lexeme1[j] == form:
                    i += 1
                n += 1
        self._assert_accuracy(i, n, 0.85)
        print("pattern.es.inflect.verbs.find_lexeme()")

    def test_conjugate(self):
        # Assert different tenses with different conjugations.
        for (v1, v2, tense) in (
                ("ser", "ser", es.INFINITIVE),
                ("ser", "soy", (es.PRESENT, 1, es.SINGULAR)),
                ("ser", "eres", (es.PRESENT, 2, es.SINGULAR)),
                ("ser", "es", (es.PRESENT, 3, es.SINGULAR)),
                ("ser", "somos", (es.PRESENT, 1, es.PLURAL)),
                ("ser", "sois", (es.PRESENT, 2, es.PLURAL)),
                ("ser", "son", (es.PRESENT, 3, es.PLURAL)),
                ("ser", "siendo", (es.PRESENT + es.PARTICIPLE)),
                ("ser", "sido", (es.PAST + es.PARTICIPLE)),
                ("ser", "era", (es.IMPERFECT, 1, es.SINGULAR)),
                ("ser", "eras", (es.IMPERFECT, 2, es.SINGULAR)),
                ("ser", "era", (es.IMPERFECT, 3, es.SINGULAR)),
                ("ser", "éramos", (es.IMPERFECT, 1, es.PLURAL)),
                ("ser", "erais", (es.IMPERFECT, 2, es.PLURAL)),
                ("ser", "eran", (es.IMPERFECT, 3, es.PLURAL)),
                ("ser", "fui", (es.PRETERITE, 1, es.SINGULAR)),
                ("ser", "fuiste", (es.PRETERITE, 2, es.SINGULAR)),
                ("ser", "fue", (es.PRETERITE, 3, es.SINGULAR)),
                ("ser", "fuimos", (es.PRETERITE, 1, es.PLURAL)),
                ("ser", "fuisteis", (es.PRETERITE, 2, es.PLURAL)),
                ("ser", "fueron", (es.PRETERITE, 3, es.PLURAL)),
                ("ser", "sería", (es.CONDITIONAL, 1, es.SINGULAR)),
                ("ser", "serías", (es.CONDITIONAL, 2, es.SINGULAR)),
                ("ser", "sería", (es.CONDITIONAL, 3, es.SINGULAR)),
                ("ser", "seríamos", (es.CONDITIONAL, 1, es.PLURAL)),
                ("ser", "seríais", (es.CONDITIONAL, 2, es.PLURAL)),
                ("ser", "serían", (es.CONDITIONAL, 3, es.PLURAL)),
                ("ser", "seré", (es.FUTURE, 1, es.SINGULAR)),
                ("ser", "serás", (es.FUTURE, 2, es.SINGULAR)),
                ("ser", "será", (es.FUTURE, 3, es.SINGULAR)),
                ("ser", "seremos", (es.FUTURE, 1, es.PLURAL)),
                ("ser", "seréis", (es.FUTURE, 2, es.PLURAL)),
                ("ser", "serán", (es.FUTURE, 3, es.PLURAL)),
                ("ser", "sé", (es.PRESENT, 2, es.SINGULAR, es.IMPERATIVE)),
                ("ser", "sed", (es.PRESENT, 2, es.PLURAL, es.IMPERATIVE)),
                ("ser", "sea", (es.PRESENT, 1, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "seas", (es.PRESENT, 2, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "sea", (es.PRESENT, 3, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "seamos", (es.PRESENT, 1, es.PLURAL, es.SUBJUNCTIVE)),
                ("ser", "seáis", (es.PRESENT, 2, es.PLURAL, es.SUBJUNCTIVE)),
                ("ser", "sean", (es.PRESENT, 3, es.PLURAL, es.SUBJUNCTIVE)),
                ("ser", "fuera", (es.PAST, 1, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "fueras", (es.PAST, 2, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "fuera", (es.PAST, 3, es.SINGULAR, es.SUBJUNCTIVE)),
                ("ser", "fuéramos", (es.PAST, 1, es.PLURAL, es.SUBJUNCTIVE)),
                ("ser", "fuerais", (es.PAST, 2, es.PLURAL, es.SUBJUNCTIVE)),
                ("ser", "fueran", (es.PAST, 3, es.PLURAL, es.SUBJUNCTIVE))):
            self.assertEqual(es.conjugate(v1, tense), v2)
        print("pattern.es.conjugate()")

    def test_lexeme(self):
        # Assert all inflections of "ser".
        v = es.lexeme("ser")
        self.assertEqual(v, [
            'ser', 'soy', 'eres', 'es', 'somos', 'sois', 'son', 'siendo',
            'fui', 'fuiste', 'fue', 'fuimos', 'fuisteis', 'fueron', 'sido',
            'era', 'eras', 'éramos', 'erais', 'eran',
            'seré', 'serás', 'será', 'seremos', 'seréis', 'serán',
            'sería', 'serías', 'seríamos', 'seríais', 'serían',
            'sé', 'sed',
            'sea', 'seas', 'seamos', 'seáis', 'sean',
            'fuera', 'fueras', 'fuéramos', 'fuerais', 'fueran'
        ])
        print("pattern.es.inflect.lexeme()")

    def test_tenses(self):
        # Assert tense recognition.
        self.assertTrue((es.PRESENT, 3, es.SG) in es.tenses("es"))
        self.assertTrue("2sg" in es.tenses("eres"))
        # The CONDITIONAL is sometimes described as a mood,
        # and sometimes as a tense of the indicative mood (e.g., in Spanish):
        t1 = (es.CONDITIONAL, 1, es.SG)
        t2 = (es.PRESENT, 1, es.SG, es.CONDITIONAL)
        self.assertTrue("1sg->" in es.tenses("sería"))
        self.assertTrue(t1 in es.tenses("sería"))
        self.assertTrue(t2 in es.tenses("sería"))
        self.assertTrue(t1 in es.tenses(es.conjugate("ser", mood=es.INDICATIVE, tense=es.CONDITIONAL)))
        self.assertTrue(t2 in es.tenses(es.conjugate("ser", mood=es.CONDITIONAL)))
        print("pattern.es.tenses()")
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
class TestParser(unittest.TestCase):
    """Tests for the Spanish parser of pattern.es: lemmata, parsing/tagging and the command line."""

    def setUp(self):
        pass

    def test_find_lemmata(self):
        # Assert lemmata for nouns, adjectives, verbs and determiners.
        v = es.parser.find_lemmata([
            ["Los", "DT"], ["gatos", "NNS"], ["negros", "JJ"], ["se", "PRP"], ["sentó", "VB"],
            ["en", "IN"], ["la", "DT"], ["alfombra", "NN"]])
        self.assertEqual(v, [
            ["Los", "DT", "el"],
            ["gatos", "NNS", "gato"],
            ["negros", "JJ", "negro"],
            ["se", "PRP", "se"],
            ["sentó", "VB", "sentar"],
            ["en", "IN", "en"],
            ["la", "DT", "el"],
            ["alfombra", "NN", "alfombra"]])
        print("pattern.es.parser.find_lemmata()")

    def test_parse(self):
        # Assert parsed output with Penn Treebank II tags (slash-formatted).
        # "el gato negro" is a noun phrase, "en la alfombra" is a prepositional noun phrase.
        v = es.parser.parse("El gato negro se sentó en la alfombra.")
        self.assertEqual(v, # XXX - shouldn't "se" be part of the verb phrase?
            "El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O "
            "se/PRP/B-NP/O sentó/VB/B-VP/O "
            "en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP alfombra/NN/I-NP/I-PNP ././O/O"
        )
        # Assert the accuracy of the Spanish tagger.
        i, n = 0, 0
        corpus = os.path.join(PATH, "corpora", "tagged-es-wikicorpus.txt")
        # The context manager closes the corpus file even if an assertion or
        # the parser raises midway.
        # NOTE(review): the corpus is assumed to be UTF-8 encoded (it holds
        # accented Spanish text) - confirm; without an explicit encoding the
        # platform default would be used.
        with open(corpus, encoding="utf-8") as f:
            for sentence in f:
                sentence = sentence.strip()
                # Gold standard: a list of [word, tag] pairs.
                s1 = [w.split("/") for w in sentence.split(" ")]
                # Re-tag the bare words (one pre-tokenized sentence).
                s2 = [[w for w, pos in s1]]
                s2 = es.parse(s2, tokenize=False, tagset=es.PAROLE)
                s2 = [w.split("/") for w in s2.split(" ")]
                for j, (w, pos) in enumerate(s1):
                    if pos == s2[j][1]:
                        i += 1
                    n += 1
        # An empty corpus would otherwise surface as an opaque ZeroDivisionError.
        self.assertGreater(n, 0, "no tagged tokens were loaded")
        self.assertGreater(float(i) / n, 0.92)
        print("pattern.es.parser.parse()")

    def test_tag(self):
        # Assert [("el", "DT"), ("gato", "NN"), ("negro", "JJ")].
        v = es.tag("el gato negro")
        self.assertEqual(v, [("el", "DT"), ("gato", "NN"), ("negro", "JJ")])
        print("pattern.es.tag()")

    def test_command_line(self):
        # Assert parsed output from the command-line (example from the documentation).
        # sys.executable runs the same interpreter as this test run, instead of
        # whatever "python" happens to resolve to on the PATH (if anything).
        command = [sys.executable, "-m", "pattern.es", "-s", "El gato negro.", "-OTCRL"]
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        # communicate() drains stdout while waiting for the child; calling
        # wait() before reading can deadlock once the pipe buffer fills up.
        output, _ = process.communicate()
        v = output.decode('utf-8').strip()
        self.assertEqual(v, "El/DT/B-NP/O/O/el gato/NN/I-NP/O/O/gato negro/JJ/I-NP/O/O/negro ././O/O/O/.")
        print("python -m pattern.es")
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
def suite():
    """Return a TestSuite with all tests from TestInflection and TestParser, in that order."""
    loader = unittest.TestLoader()
    collected = unittest.TestSuite()
    for case in (TestInflection, TestParser):
        collected.addTest(loader.loadTestsFromTestCase(case))
    return collected
|
|
|
|
if __name__ == "__main__":

    # Run every test and report the outcome through the process exit status:
    # falsy (0) when all tests passed, truthy (1) otherwise.
    runner = unittest.TextTestRunner(verbosity=1)
    outcome = runner.run(suite())
    failed = not outcome.wasSuccessful()
    sys.exit(failed)
|