You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
291 lines
12 KiB
Python
291 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import unicode_literals
|
|
from __future__ import print_function
|
|
from __future__ import division
|
|
|
|
from builtins import str, bytes, dict, int
|
|
from builtins import map, zip, filter
|
|
from builtins import object, range
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
import unittest
|
|
import subprocess
|
|
|
|
from pattern import it
|
|
|
|
from io import open
|
|
|
|
try:
|
|
PATH = os.path.dirname(os.path.realpath(__file__))
|
|
except:
|
|
PATH = ""
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
class TestInflection(unittest.TestCase):
|
|
|
|
def setUp(self):
|
|
pass
|
|
|
|
def test_article(self):
|
|
# Assert definite and indefinite article inflection.
|
|
for a, n, g in (
|
|
("il" , "giorno" , it.M),
|
|
("l'" , "altro giorno", it.M),
|
|
("lo" , "zio" , it.M),
|
|
("l'" , "amica" , it.F),
|
|
("la" , "nouva amica" , it.F),
|
|
("i" , "giapponesi" , it.M + it.PL),
|
|
("gli", "italiani" , it.M + it.PL),
|
|
("gli", "zii" , it.M + it.PL),
|
|
("le" , "zie" , it.F + it.PL)):
|
|
v = it.article(n, "definite", gender=g)
|
|
self.assertEqual(a, v)
|
|
for a, n, g in (
|
|
("uno", "zio" , it.M),
|
|
("una", "zia" , it.F),
|
|
("un" , "amico", it.M),
|
|
("un'", "amica", it.F)):
|
|
v = it.article(n, "indefinite", gender=g)
|
|
self.assertEqual(a, v)
|
|
v = it.referenced("amica", gender="f")
|
|
self.assertEqual(v, "un'amica")
|
|
print("pattern.it.article()")
|
|
print("pattern.it.referenced()")
|
|
|
|
def test_gender(self):
|
|
# Assert the accuracy of the gender disambiguation algorithm.
|
|
from pattern.db import Datasheet
|
|
i, n = 0, 0
|
|
for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
|
|
g = it.gender(sg)
|
|
if mf in g and it.PLURAL not in g:
|
|
i += 1
|
|
g = it.gender(pl)
|
|
if mf in g and it.PLURAL in g:
|
|
i += 1
|
|
n += 2
|
|
self.assertTrue(float(i) / n > 0.92)
|
|
print("pattern.it.gender()")
|
|
|
|
def test_pluralize(self):
|
|
# Assert the accuracy of the pluralization algorithm.
|
|
from pattern.db import Datasheet
|
|
i, n = 0, 0
|
|
for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
|
|
if it.pluralize(sg) == pl:
|
|
i += 1
|
|
n += 1
|
|
self.assertTrue(float(i) / n > 0.93)
|
|
print("pattern.it.pluralize()")
|
|
|
|
def test_singularize(self):
|
|
# Assert the accuracy of the singularization algorithm.
|
|
from pattern.db import Datasheet
|
|
i, n = 0, 0
|
|
for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
|
|
if it.singularize(pl) == sg:
|
|
i += 1
|
|
n += 1
|
|
self.assertTrue(float(i) / n > 0.84)
|
|
print("pattern.it.singularize()")
|
|
|
|
def test_predicative(self):
|
|
# Assert the accuracy of the predicative algorithm ("cruciali" => "cruciale").
|
|
|
|
from pattern.db import Datasheet
|
|
i, n = 0, 0
|
|
for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
|
|
if pos != "j":
|
|
continue
|
|
if it.predicative(pl) == sg:
|
|
i += 1
|
|
n += 1
|
|
self.assertTrue(float(i) / n > 0.87)
|
|
print("pattern.it.predicative()")
|
|
|
|
def test_find_lemma(self):
|
|
# Assert the accuracy of the verb lemmatization algorithm.
|
|
i, n = 0, 0
|
|
r = 0
|
|
for v1, v2 in it.inflect.verbs.inflections.items():
|
|
if it.inflect.verbs.find_lemma(v1) == v2:
|
|
i += 1
|
|
n += 1
|
|
self.assertTrue(float(i) / n > 0.81)
|
|
print("pattern.it.inflect.verbs.find_lemma()")
|
|
|
|
def test_find_lexeme(self):
|
|
# Assert the accuracy of the verb conjugation algorithm.
|
|
i, n = 0, 0
|
|
for v, lexeme1 in it.inflect.verbs.infinitives.items():
|
|
lexeme2 = it.inflect.verbs.find_lexeme(v)
|
|
for j in range(len(lexeme2)):
|
|
if lexeme1[j] == lexeme2[j]:
|
|
i += 1
|
|
n += 1
|
|
self.assertTrue(float(i) / n > 0.89)
|
|
print("pattern.it.inflect.verbs.find_lexeme()")
|
|
|
|
def test_conjugate(self):
|
|
# Assert different tenses with different conjugations.
|
|
for (v1, v2, tense) in (
|
|
("essere", "essere", it.INFINITIVE),
|
|
("essere", "sono", (it.PRESENT, 1, it.SINGULAR)),
|
|
("essere", "sei", (it.PRESENT, 2, it.SINGULAR)),
|
|
("essere", "è", (it.PRESENT, 3, it.SINGULAR)),
|
|
("essere", "siamo", (it.PRESENT, 1, it.PLURAL)),
|
|
("essere", "siete", (it.PRESENT, 2, it.PLURAL)),
|
|
("essere", "sono", (it.PRESENT, 3, it.PLURAL)),
|
|
("essere", "essendo", (it.PRESENT + it.PARTICIPLE)),
|
|
("essere", "stato", (it.PAST + it.PARTICIPLE)),
|
|
("essere", "ero", (it.IMPERFECT, 1, it.SINGULAR)),
|
|
("essere", "eri", (it.IMPERFECT, 2, it.SINGULAR)),
|
|
("essere", "era", (it.IMPERFECT, 3, it.SINGULAR)),
|
|
("essere", "eravamo", (it.IMPERFECT, 1, it.PLURAL)),
|
|
("essere", "eravate", (it.IMPERFECT, 2, it.PLURAL)),
|
|
("essere", "erano", (it.IMPERFECT, 3, it.PLURAL)),
|
|
("essere", "fui", (it.PRETERITE, 1, it.SINGULAR)),
|
|
("essere", "fosti", (it.PRETERITE, 2, it.SINGULAR)),
|
|
("essere", "fu", (it.PRETERITE, 3, it.SINGULAR)),
|
|
("essere", "fummo", (it.PRETERITE, 1, it.PLURAL)),
|
|
("essere", "foste", (it.PRETERITE, 2, it.PLURAL)),
|
|
("essere", "furono", (it.PRETERITE, 3, it.PLURAL)),
|
|
("essere", "sarei", (it.CONDITIONAL, 1, it.SINGULAR)),
|
|
("essere", "saresti", (it.CONDITIONAL, 2, it.SINGULAR)),
|
|
("essere", "sarebbe", (it.CONDITIONAL, 3, it.SINGULAR)),
|
|
("essere", "saremmo", (it.CONDITIONAL, 1, it.PLURAL)),
|
|
("essere", "sareste", (it.CONDITIONAL, 2, it.PLURAL)),
|
|
("essere", "sarebbero", (it.CONDITIONAL, 3, it.PLURAL)),
|
|
("essere", "sarò", (it.FUTURE, 1, it.SINGULAR)),
|
|
("essere", "sarai", (it.FUTURE, 2, it.SINGULAR)),
|
|
("essere", "sarà", (it.FUTURE, 3, it.SINGULAR)),
|
|
("essere", "saremo", (it.FUTURE, 1, it.PLURAL)),
|
|
("essere", "sarete", (it.FUTURE, 2, it.PLURAL)),
|
|
("essere", "saranno", (it.FUTURE, 3, it.PLURAL)),
|
|
("essere", "sii", (it.PRESENT, 2, it.SINGULAR, it.IMPERATIVE)),
|
|
("essere", "sia", (it.PRESENT, 3, it.SINGULAR, it.IMPERATIVE)),
|
|
("essere", "siamo", (it.PRESENT, 1, it.PLURAL, it.IMPERATIVE)),
|
|
("essere", "siate", (it.PRESENT, 2, it.PLURAL, it.IMPERATIVE)),
|
|
("essere", "siano", (it.PRESENT, 3, it.PLURAL, it.IMPERATIVE)),
|
|
("essere", "sia", (it.PRESENT, 1, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "sia", (it.PRESENT, 2, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "sia", (it.PRESENT, 3, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "siamo", (it.PRESENT, 1, it.PLURAL, it.SUBJUNCTIVE)),
|
|
("essere", "siate", (it.PRESENT, 2, it.PLURAL, it.SUBJUNCTIVE)),
|
|
("essere", "siano", (it.PRESENT, 3, it.PLURAL, it.SUBJUNCTIVE)),
|
|
("essere", "fossi", (it.PAST, 1, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "fossi", (it.PAST, 2, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "fosse", (it.PAST, 3, it.SINGULAR, it.SUBJUNCTIVE)),
|
|
("essere", "fossimo", (it.PAST, 1, it.PLURAL, it.SUBJUNCTIVE)),
|
|
("essere", "foste", (it.PAST, 2, it.PLURAL, it.SUBJUNCTIVE)),
|
|
("essere", "fossero", (it.PAST, 3, it.PLURAL, it.SUBJUNCTIVE))):
|
|
self.assertEqual(it.conjugate(v1, tense), v2)
|
|
print("pattern.it.conjugate()")
|
|
|
|
def test_lexeme(self):
|
|
# Assert all inflections of "essere".
|
|
v = it.lexeme("essere")
|
|
self.assertEqual(v, [
|
|
'essere', 'sono', 'sei', 'è', 'siamo', 'siete', 'essendo',
|
|
'fui', 'fosti', 'fu', 'fummo', 'foste', 'furono', 'stato',
|
|
'ero', 'eri', 'era', 'eravamo', 'eravate', 'erano',
|
|
'sarò', 'sarai', 'sarà', 'saremo', 'sarete', 'saranno',
|
|
'sarei', 'saresti', 'sarebbe', 'saremmo', 'sareste', 'sarebbero',
|
|
'sii', 'sia', 'siate', 'siano',
|
|
'fossi', 'fosse', 'fossimo', 'fossero'
|
|
])
|
|
print("pattern.it.inflect.lexeme()")
|
|
|
|
def test_tenses(self):
|
|
# Assert tense recognition.
|
|
self.assertTrue((it.PRESENT, 3, it.SG) in it.tenses("è"))
|
|
self.assertTrue("2sg" in it.tenses("sei"))
|
|
print("pattern.it.tenses()")
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
class TestParser(unittest.TestCase):
|
|
|
|
def setUp(self):
|
|
pass
|
|
|
|
def test_find_lemmata(self):
|
|
# Assert lemmata for nouns, adjectives, verbs and determiners.
|
|
v = it.parser.find_lemmata([
|
|
["I", "DT"], ["gatti", "NNS"], ["neri", "JJ"],
|
|
["seduti", "VB"], ["sul", "IN"], ["tatami", "NN"]])
|
|
self.assertEqual(v, [
|
|
["I", "DT", "il"],
|
|
["gatti", "NNS", "gatto"],
|
|
["neri", "JJ", "nero"],
|
|
["seduti", "VB", "sedutare"],
|
|
["sul", "IN", "sul"],
|
|
["tatami", "NN", "tatami"]])
|
|
print("pattern.it.parser.find_lemmata()")
|
|
|
|
def test_parse(self):
|
|
# Assert parsed output with Penn Treebank II tags (slash-formatted).
|
|
# "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase.
|
|
v = it.parser.parse("Il gatto nero seduto sulla stuoia.")
|
|
self.assertEqual(v,
|
|
"Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O " +
|
|
"seduto/VB/B-VP/O " + \
|
|
"sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O"
|
|
)
|
|
# Assert the accuracy of the Italian tagger.
|
|
i, n = 0, 0
|
|
for sentence in open(os.path.join(PATH, "corpora", "tagged-it-wacky.txt")).readlines():
|
|
sentence = sentence.strip()
|
|
s1 = [w.split("/") for w in sentence.split(" ")]
|
|
s2 = [[w for w, pos in s1]]
|
|
s2 = it.parse(s2, tokenize=False)
|
|
s2 = [w.split("/") for w in s2.split(" ")]
|
|
for j in range(len(s1)):
|
|
t1 = s1[j][1]
|
|
t2 = s2[j][1]
|
|
# WaCKy test set tags plural nouns as "NN", pattern.it as "NNS".
|
|
# Some punctuation marks are also tagged differently,
|
|
# but these are not necessarily errors.
|
|
if t1 == t2 or (t1 == "NN" and t2.startswith("NN")) or s1[j][0] in "\":;)-":
|
|
i += 1
|
|
n += 1
|
|
#print(float(i) / n)
|
|
self.assertTrue(float(i) / n > 0.92)
|
|
print("pattern.it.parser.parse()")
|
|
|
|
def test_tag(self):
|
|
# Assert [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")].
|
|
v = it.tag("il gatto nero")
|
|
self.assertEqual(v, [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")])
|
|
print("pattern.it.tag()")
|
|
|
|
def test_command_line(self):
|
|
# Assert parsed output from the command-line (example from the documentation).
|
|
p = ["python", "-m", "pattern.it", "-s", "Il gatto nero.", "-OTCRL"]
|
|
p = subprocess.Popen(p, stdout=subprocess.PIPE)
|
|
p.wait()
|
|
v = p.stdout.read().decode('utf-8')
|
|
v = v.strip()
|
|
self.assertEqual(v, "Il/DT/B-NP/O/O/il gatto/NN/I-NP/O/O/gatto nero/JJ/I-NP/O/O/nero ././O/O/O/.")
|
|
print("python -m pattern.it")
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
|
|
|
def suite():
|
|
suite = unittest.TestSuite()
|
|
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection))
|
|
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser))
|
|
return suite
|
|
|
|
if __name__ == "__main__":
|
|
|
|
result = unittest.TextTestRunner(verbosity=1).run(suite())
|
|
sys.exit(not result.wasSuccessful())
|