# -*- coding: utf-8 -*- from __future__ import unicode_literals from __future__ import print_function from __future__ import division from builtins import str, bytes, dict, int from builtins import map, zip, filter from builtins import object, range import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import unittest import subprocess from pattern import it from io import open try: PATH = os.path.dirname(os.path.realpath(__file__)) except: PATH = "" #--------------------------------------------------------------------------------------------------- class TestInflection(unittest.TestCase): def setUp(self): pass def test_article(self): # Assert definite and indefinite article inflection. for a, n, g in ( ("il" , "giorno" , it.M), ("l'" , "altro giorno", it.M), ("lo" , "zio" , it.M), ("l'" , "amica" , it.F), ("la" , "nouva amica" , it.F), ("i" , "giapponesi" , it.M + it.PL), ("gli", "italiani" , it.M + it.PL), ("gli", "zii" , it.M + it.PL), ("le" , "zie" , it.F + it.PL)): v = it.article(n, "definite", gender=g) self.assertEqual(a, v) for a, n, g in ( ("uno", "zio" , it.M), ("una", "zia" , it.F), ("un" , "amico", it.M), ("un'", "amica", it.F)): v = it.article(n, "indefinite", gender=g) self.assertEqual(a, v) v = it.referenced("amica", gender="f") self.assertEqual(v, "un'amica") print("pattern.it.article()") print("pattern.it.referenced()") def test_gender(self): # Assert the accuracy of the gender disambiguation algorithm. from pattern.db import Datasheet i, n = 0, 0 for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")): g = it.gender(sg) if mf in g and it.PLURAL not in g: i += 1 g = it.gender(pl) if mf in g and it.PLURAL in g: i += 1 n += 2 self.assertTrue(float(i) / n > 0.92) print("pattern.it.gender()") def test_pluralize(self): # Assert the accuracy of the pluralization algorithm. from pattern.db import Datasheet i, n = 0, 0 for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")): if it.pluralize(sg) == pl: i += 1 n += 1 self.assertTrue(float(i) / n > 0.93) print("pattern.it.pluralize()") def test_singularize(self): # Assert the accuracy of the singularization algorithm. from pattern.db import Datasheet i, n = 0, 0 for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")): if it.singularize(pl) == sg: i += 1 n += 1 self.assertTrue(float(i) / n > 0.84) print("pattern.it.singularize()") def test_predicative(self): # Assert the accuracy of the predicative algorithm ("cruciali" => "cruciale"). from pattern.db import Datasheet i, n = 0, 0 for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")): if pos != "j": continue if it.predicative(pl) == sg: i += 1 n += 1 self.assertTrue(float(i) / n > 0.87) print("pattern.it.predicative()") def test_find_lemma(self): # Assert the accuracy of the verb lemmatization algorithm. i, n = 0, 0 r = 0 for v1, v2 in it.inflect.verbs.inflections.items(): if it.inflect.verbs.find_lemma(v1) == v2: i += 1 n += 1 self.assertTrue(float(i) / n > 0.81) print("pattern.it.inflect.verbs.find_lemma()") def test_find_lexeme(self): # Assert the accuracy of the verb conjugation algorithm. i, n = 0, 0 for v, lexeme1 in it.inflect.verbs.infinitives.items(): lexeme2 = it.inflect.verbs.find_lexeme(v) for j in range(len(lexeme2)): if lexeme1[j] == lexeme2[j]: i += 1 n += 1 self.assertTrue(float(i) / n > 0.89) print("pattern.it.inflect.verbs.find_lexeme()") def test_conjugate(self): # Assert different tenses with different conjugations. for (v1, v2, tense) in ( ("essere", "essere", it.INFINITIVE), ("essere", "sono", (it.PRESENT, 1, it.SINGULAR)), ("essere", "sei", (it.PRESENT, 2, it.SINGULAR)), ("essere", "è", (it.PRESENT, 3, it.SINGULAR)), ("essere", "siamo", (it.PRESENT, 1, it.PLURAL)), ("essere", "siete", (it.PRESENT, 2, it.PLURAL)), ("essere", "sono", (it.PRESENT, 3, it.PLURAL)), ("essere", "essendo", (it.PRESENT + it.PARTICIPLE)), ("essere", "stato", (it.PAST + it.PARTICIPLE)), ("essere", "ero", (it.IMPERFECT, 1, it.SINGULAR)), ("essere", "eri", (it.IMPERFECT, 2, it.SINGULAR)), ("essere", "era", (it.IMPERFECT, 3, it.SINGULAR)), ("essere", "eravamo", (it.IMPERFECT, 1, it.PLURAL)), ("essere", "eravate", (it.IMPERFECT, 2, it.PLURAL)), ("essere", "erano", (it.IMPERFECT, 3, it.PLURAL)), ("essere", "fui", (it.PRETERITE, 1, it.SINGULAR)), ("essere", "fosti", (it.PRETERITE, 2, it.SINGULAR)), ("essere", "fu", (it.PRETERITE, 3, it.SINGULAR)), ("essere", "fummo", (it.PRETERITE, 1, it.PLURAL)), ("essere", "foste", (it.PRETERITE, 2, it.PLURAL)), ("essere", "furono", (it.PRETERITE, 3, it.PLURAL)), ("essere", "sarei", (it.CONDITIONAL, 1, it.SINGULAR)), ("essere", "saresti", (it.CONDITIONAL, 2, it.SINGULAR)), ("essere", "sarebbe", (it.CONDITIONAL, 3, it.SINGULAR)), ("essere", "saremmo", (it.CONDITIONAL, 1, it.PLURAL)), ("essere", "sareste", (it.CONDITIONAL, 2, it.PLURAL)), ("essere", "sarebbero", (it.CONDITIONAL, 3, it.PLURAL)), ("essere", "sarò", (it.FUTURE, 1, it.SINGULAR)), ("essere", "sarai", (it.FUTURE, 2, it.SINGULAR)), ("essere", "sarà", (it.FUTURE, 3, it.SINGULAR)), ("essere", "saremo", (it.FUTURE, 1, it.PLURAL)), ("essere", "sarete", (it.FUTURE, 2, it.PLURAL)), ("essere", "saranno", (it.FUTURE, 3, it.PLURAL)), ("essere", "sii", (it.PRESENT, 2, it.SINGULAR, it.IMPERATIVE)), ("essere", "sia", (it.PRESENT, 3, it.SINGULAR, it.IMPERATIVE)), ("essere", "siamo", (it.PRESENT, 1, it.PLURAL, it.IMPERATIVE)), ("essere", "siate", (it.PRESENT, 2, it.PLURAL, it.IMPERATIVE)), ("essere", "siano", (it.PRESENT, 3, it.PLURAL, it.IMPERATIVE)), ("essere", "sia", (it.PRESENT, 1, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "sia", (it.PRESENT, 2, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "sia", (it.PRESENT, 3, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "siamo", (it.PRESENT, 1, it.PLURAL, it.SUBJUNCTIVE)), ("essere", "siate", (it.PRESENT, 2, it.PLURAL, it.SUBJUNCTIVE)), ("essere", "siano", (it.PRESENT, 3, it.PLURAL, it.SUBJUNCTIVE)), ("essere", "fossi", (it.PAST, 1, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "fossi", (it.PAST, 2, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "fosse", (it.PAST, 3, it.SINGULAR, it.SUBJUNCTIVE)), ("essere", "fossimo", (it.PAST, 1, it.PLURAL, it.SUBJUNCTIVE)), ("essere", "foste", (it.PAST, 2, it.PLURAL, it.SUBJUNCTIVE)), ("essere", "fossero", (it.PAST, 3, it.PLURAL, it.SUBJUNCTIVE))): self.assertEqual(it.conjugate(v1, tense), v2) print("pattern.it.conjugate()") def test_lexeme(self): # Assert all inflections of "essere". v = it.lexeme("essere") self.assertEqual(v, [ 'essere', 'sono', 'sei', 'è', 'siamo', 'siete', 'essendo', 'fui', 'fosti', 'fu', 'fummo', 'foste', 'furono', 'stato', 'ero', 'eri', 'era', 'eravamo', 'eravate', 'erano', 'sarò', 'sarai', 'sarà', 'saremo', 'sarete', 'saranno', 'sarei', 'saresti', 'sarebbe', 'saremmo', 'sareste', 'sarebbero', 'sii', 'sia', 'siate', 'siano', 'fossi', 'fosse', 'fossimo', 'fossero' ]) print("pattern.it.inflect.lexeme()") def test_tenses(self): # Assert tense recognition. self.assertTrue((it.PRESENT, 3, it.SG) in it.tenses("è")) self.assertTrue("2sg" in it.tenses("sei")) print("pattern.it.tenses()") #--------------------------------------------------------------------------------------------------- class TestParser(unittest.TestCase): def setUp(self): pass def test_find_lemmata(self): # Assert lemmata for nouns, adjectives, verbs and determiners. v = it.parser.find_lemmata([ ["I", "DT"], ["gatti", "NNS"], ["neri", "JJ"], ["seduti", "VB"], ["sul", "IN"], ["tatami", "NN"]]) self.assertEqual(v, [ ["I", "DT", "il"], ["gatti", "NNS", "gatto"], ["neri", "JJ", "nero"], ["seduti", "VB", "sedutare"], ["sul", "IN", "sul"], ["tatami", "NN", "tatami"]]) print("pattern.it.parser.find_lemmata()") def test_parse(self): # Assert parsed output with Penn Treebank II tags (slash-formatted). # "il gatto nero" is a noun phrase, "sulla stuoia" is a prepositional noun phrase. v = it.parser.parse("Il gatto nero seduto sulla stuoia.") self.assertEqual(v, "Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O " + "seduto/VB/B-VP/O " + \ "sulla/IN/B-PP/B-PNP stuoia/NN/B-NP/I-PNP ././O/O" ) # Assert the accuracy of the Italian tagger. i, n = 0, 0 for sentence in open(os.path.join(PATH, "corpora", "tagged-it-wacky.txt")).readlines(): sentence = sentence.strip() s1 = [w.split("/") for w in sentence.split(" ")] s2 = [[w for w, pos in s1]] s2 = it.parse(s2, tokenize=False) s2 = [w.split("/") for w in s2.split(" ")] for j in range(len(s1)): t1 = s1[j][1] t2 = s2[j][1] # WaCKy test set tags plural nouns as "NN", pattern.it as "NNS". # Some punctuation marks are also tagged differently, # but these are not necessarily errors. if t1 == t2 or (t1 == "NN" and t2.startswith("NN")) or s1[j][0] in "\":;)-": i += 1 n += 1 #print(float(i) / n) self.assertTrue(float(i) / n > 0.92) print("pattern.it.parser.parse()") def test_tag(self): # Assert [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")]. v = it.tag("il gatto nero") self.assertEqual(v, [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")]) print("pattern.it.tag()") def test_command_line(self): # Assert parsed output from the command-line (example from the documentation). p = ["python", "-m", "pattern.it", "-s", "Il gatto nero.", "-OTCRL"] p = subprocess.Popen(p, stdout=subprocess.PIPE) p.wait() v = p.stdout.read().decode('utf-8') v = v.strip() self.assertEqual(v, "Il/DT/B-NP/O/O/il gatto/NN/I-NP/O/O/gatto nero/JJ/I-NP/O/O/nero ././O/O/O/.") print("python -m pattern.it") #--------------------------------------------------------------------------------------------------- def suite(): suite = unittest.TestSuite() suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestInflection)) suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestParser)) return suite if __name__ == "__main__": result = unittest.TextTestRunner(verbosity=1).run(suite()) sys.exit(not result.wasSuccessful())