|
|
#### PATTERN | IT | INFLECT ########################################################################
|
|
|
# -*- coding: utf-8 -*-
|
|
|
# Copyright (c) 2013 University of Antwerp, Belgium
|
|
|
# Copyright (c) 2013 St. Lucas University College of Art & Design, Antwerp.
|
|
|
# Author: Tom De Smedt <tom@organisms.be>
|
|
|
# License: BSD (see LICENSE.txt for details).
|
|
|
|
|
|
####################################################################################################
|
|
|
# Regular expressions-based rules for Italian word inflection:
|
|
|
# - pluralization and singularization of nouns,
|
|
|
# - conjugation of verbs,
|
|
|
# - predicative adjectives.
|
|
|
|
|
|
# Accuracy:
|
|
|
# 92% for gender()
|
|
|
# 93% for pluralize()
|
|
|
# 84% for singularize()
|
|
|
# 82% for Verbs.find_lemma()
|
|
|
# 90% for Verbs.find_lexeme()
|
|
|
# 88% for predicative()
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
from __future__ import division
|
|
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
from builtins import map, zip, filter
|
|
|
from builtins import object, range
|
|
|
|
|
|
import os
|
|
|
import sys
|
|
|
import re
|
|
|
|
|
|
try:
|
|
|
MODULE = os.path.dirname(os.path.realpath(__file__))
|
|
|
except:
|
|
|
MODULE = ""
|
|
|
|
|
|
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
|
|
|
|
|
|
# Import Verbs base class and verb tenses.
|
|
|
from pattern.text import Verbs as _Verbs
|
|
|
from pattern.text import (
|
|
|
INFINITIVE, PRESENT, PAST, FUTURE, CONDITIONAL,
|
|
|
FIRST, SECOND, THIRD,
|
|
|
SINGULAR, PLURAL, SG, PL,
|
|
|
INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
|
|
|
IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
|
|
|
IMPERFECT, PRETERITE,
|
|
|
PARTICIPLE, GERUND
|
|
|
)
|
|
|
|
|
|
sys.path.pop(0)
|
|
|
|
|
|
# Part-of-speech tags accepted by the inflection functions.
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

VOWELS = "aeiouy"
re_vowel = re.compile(r"a|e|i|o|u|y", re.I)


def is_vowel(ch):
    """ Returns True if the given character is a vowel (case-insensitive),
        False otherwise. The empty string is not a vowel.
    """
    # "ch in VOWELS" alone is a substring test: it is True for "" (which is
    # what word[:1] yields for an empty word) and False for uppercase vowels.
    return bool(ch) and ch.lower() in VOWELS
|
|
|
|
|
|
#### ARTICLE #######################################################################################
|
|
|
|
|
|
# Inflection gender.
|
|
|
# Gender and number markers: the short, long and alternative names
# all refer to the same four one-letter values.
M, F, N, PL = "m", "f", "n", "p"
MALE, FEMALE, NEUTRAL, PLURAL = M, F, N, PL
MASCULINE, FEMININE, NEUTER, PLURAL = M, F, N, PL


def zs(w):
    """ Returns a true value if the word starts with z, or with s that is
        not followed by a vowel. A false word (e.g., "") is returned as-is.
    """
    if not w:
        return w
    if w[:1] == "z":
        return True
    return w[:1] == "s" and not is_vowel(w[1:2])
|
|
|
|
|
|
|
|
|
def definite_article(word, gender=MALE):
    """ Returns the definite article for a given word and gender:
        "il", "lo", "l'" or "la" in the singular, "i", "gli" or "le" in the plural.
        The gender is a marker such as MALE, or a tuple such as (FEMALE, PLURAL).
    """
    plural = PLURAL in gender
    male = MALE in gender
    if plural:
        # gli amici, gli studenti
        if male and (is_vowel(word[:1]) or zs(word)):
            return "gli"
    else:
        # l'amico, l'amica
        if word and is_vowel(word[:1]):
            return "l'"
        # lo studente
        if male and zs(word):
            return "lo"
    if male:
        return "i" if plural else "il"
    if FEMALE in gender:
        return "le" if plural else "la"
    return "il"
|
|
|
|
|
|
|
|
|
def indefinite_article(word, gender=MALE):
    """ Returns the indefinite article for a given word and gender:
        "un", "uno", "una" or "un'" in the singular,
        "dei", "degli" or "delle" in the plural.
        The gender is a marker such as MALE, or a tuple such as (FEMALE, PLURAL).
    """
    plural = PLURAL in gender
    # An empty word does not start with a vowel.
    vowel = bool(word) and is_vowel(word[:1])
    if MALE in gender:
        if plural:
            # Same condition as "gli" in definite_article():
            # degli amici, degli studenti, but dei libri.
            return "degli" if (vowel or zs(word)) else "dei"
        # uno studente, un amico, un libro
        return "uno" if zs(word) else "un"
    if FEMALE in gender:
        if plural:
            return "delle"
        # un'amica, una casa
        return "un'" if vowel else "una"
    return "un"
|
|
|
|
|
|
# Article types accepted by article() and referenced().
DEFINITE = "definite"
INDEFINITE = "indefinite"


def article(word, function=INDEFINITE, gender=MALE):
    """ Returns the definite article for the given word if function is DEFINITE,
        otherwise the indefinite article.
    """
    if function == DEFINITE:
        chosen = definite_article(word, gender)
        if chosen:
            return chosen
    return indefinite_article(word, gender)

# Second name for article(), since referenced() has a parameter named "article".
_article = article
|
|
|
|
|
|
|
|
|
def referenced(word, article=INDEFINITE, gender=MALE):
    """ Returns a string with the article + the word.
        The two are separated by a space, unless the article ends in
        an apostrophe (l', un'), in which case they are joined directly.
    """
    a = _article(word, article, gender)
    # Join directly rather than through a "&space;" placeholder that is
    # substituted afterwards: the substitution would also rewrite a word
    # that happens to contain the placeholder text.
    if a.endswith("'"):
        return "%s%s" % (a, word)
    return "%s %s" % (a, word)
|
|
|
|
|
|
#### GENDER #########################################################################################
|
|
|
|
|
|
|
|
|
def gender(word):
    """ Returns the gender for the given word, either:
        MALE, FEMALE, (MALE, FEMALE), (MALE, PLURAL) or (FEMALE, PLURAL).
        The guess is based on the word ending only.
    """
    w = word.lower()
    # Tried from top to bottom; the first group of endings that matches wins.
    rules = (
        # Adjectives ending in -e: cruciale, difficile, ...
        (("ale", "ile", "ese", "nte"), (MALE, FEMALE)),
        # Most nouns ending in -a (-e) are feminine, -o (-i) masculine:
        (("ore", "ista", "mma"), MALE),
        (("a", "tà", "tù", "ione", "rice"), FEMALE),
        (("e", "oni"), (FEMALE, PLURAL)),
        (("i",), (MALE, PLURAL)),
    )
    for endings, g in rules:
        if w.endswith(endings):
            return g
    # Words ending in -o, and anything else.
    return MALE
|
|
|
|
|
|
#### PLURALIZE ######################################################################################
|
|
|
|
|
|
# Nouns that pluralize() looks up for a plural in -chi.
# NOTE(review): "fungo" ends in -go, so it looks misfiled here.
plural_co_chi = {
    "abbaco",
    "baco",
    "cuoco",
    "fungo",
    "rammarico",
    "strascio",
    "valico",
    # ...
}

# Nouns listed for a plural in -ghi.
# NOTE(review): "manico" and "stomaco" end in -co, so they look misfiled here.
plural_go_ghi = {
    "albergo",
    "catalogo",
    "chirurgo",
    "dialogo",
    "manico",
    "monologo",
    "stomaco",
    # ...
}

# Singular => plural, for nouns that do not follow the ending-based rules.
plural_irregular = {
    "braccio" : "braccia",  # bracci (arms of a lamp or cross)
    "budello" : "budelli",  # budella (intestines)
    "camicia" : "camicie",
    "bue"     : "buoi",
    "dio"     : "dei",
    "dito"    : "dita",
    "doccia"  : "docce",
    "inizio"  : "inizi",
    "labbro"  : "labbra",   # labbri (borders)
    "mano"    : "mani",
    "negozio" : "negozi",
    "osso"    : "ossa",     # ossi (dog bones)
    "uomo"    : "uomini",
    "uovo"    : "uova",
}
|
|
|
|
|
|
|
|
|
def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural of a given word.
        The custom dictionary is for user-defined replacements (it is only read).
        Any other word is lowercased; words shorter than three characters
        are returned as they are.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    if len(w) < 3:
        return w
    if w in plural_irregular:
        return plural_irregular[w]
    # provincia => province (but: socia => socie)
    if w.endswith(("cia", "gia")) and len(w) > 4 and not is_vowel(w[-4]):
        return w[:-2] + "e"
    # amica => amiche
    # Only the final -a is dropped: the c/g stays and takes an h.
    if w.endswith(("ca", "ga")):
        return w[:-1] + "he"
    # studentessa => studentesse
    if w.endswith("a"):
        return w[:-1] + "e"
    # studente => studenti
    if w.endswith("e"):
        return w[:-1] + "i"
    # viaggio => viaggi (but: leggìo => leggìi)
    if w.endswith("io"):
        return w[:-2] + "i"
    # abbaco => abbachi, albergo => alberghi
    # Both lists are consulted and the h follows the word's own c or g,
    # so the result does not depend on which of the two lists holds the word.
    if w in plural_co_chi or w in plural_go_ghi:
        return w[:-1] + "hi"
    # amico => amici
    if w.endswith("o"):
        return w[:-1] + "i"
    return w
|
|
|
|
|
|
#### SINGULARIZE ###################################################################################
|
|
|
|
|
|
# (plural ending, singular ending) pairs, tried in this order by singularize():
# the first ending that matches the word is replaced.
singular_majority_vote = [
    # Five- and four-letter endings.
    ("tenti", "tente"),
    ("anti", "ante"),
    # Three-letter endings.
    ("oni", "one"), ("nti", "nto"), ("ali", "ale"), ("ici", "ico"), ("nze", "nza"),
    ("ori", "ore"), ("che", "ca"), ("ati", "ato"), ("ari", "ario"), ("tti", "tto"),
    ("eri", "ero"), ("chi", "co"), ("ani", "ano"), ("ure", "ura"), ("ità", "ità"),
    ("ivi", "ivo"), ("ini", "ino"), ("iti", "ito"), ("emi", "ema"), ("ili", "ile"),
    ("oli", "olo"), ("esi", "ese"), ("ate", "ata"), ("ssi", "sso"), ("rie", "ria"),
    ("ine", "ina"), ("lli", "llo"), ("ggi", "ggio"), ("tri", "tro"), ("imi", "imo"),
]
|
|
|
|
|
|
# Plural => singular: plural_irregular, inverted.
singular_irregular = dict((v, k) for k, v in plural_irregular.items())


def singularize(word, pos=NOUN, custom={}):
    """ Returns the singular of a given word.
        The custom dictionary is for user-defined replacements (it is only read).
        Any other word is lowercased. With pos="DT" the word is treated as
        an article; otherwise words shorter than three characters
        are returned as they are.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    # i gatti => il gatto
    if pos == "DT":
        if w in ("i", "gli"):
            return "il"
        # le case => la casa
        # ("el" was the key tested before and is not an Italian article;
        #  it is still accepted so that existing callers see no change.)
        if w in ("le", "el"):
            return "la"
        return w
    if len(w) < 3:
        return w
    if w in singular_irregular:
        return singular_irregular[w]
    # Ruleset adds 16% accuracy.
    for a, b in singular_majority_vote:
        if w.endswith(a):
            return w[:-len(a)] + b
    # The rules below only apply to endings not claimed by the ruleset above.
    # Probably an adjective ending in -e: cruciale, difficile, ...
    if w.endswith(("ali", "ari", "ili", "esi", "nti")):
        return w[:-1] + "e"
    # realisti => realista
    if w.endswith("isti"):
        return w[:-1] + "a"
    # amiche => amica
    if w.endswith(("che", "ghe")):
        return w[:-2] + "a"
    # alberghi => albergo
    if w.endswith(("chi", "ghi")):
        return w[:-2] + "o"
    # problemi => problema
    if w.endswith("emi"):
        return w[:-1] + "a"
    # case => casa
    if w.endswith("e"):
        return w[:-1] + "a"
    # Ambiguous: both -o and -a pluralize to -i.
    if w.endswith("i"):
        return w[:-1] + "o"
    return w
|
|
|
|
|
|
#### VERB CONJUGATION ##############################################################################
|
|
|
# The verb table was trained on Wiktionary and contains the 1,250 most frequent verbs.
|
|
|
|
|
|
# (inflected ending, infinitive ending) pairs, tried in this order by
# Verbs.find_lemma(): the first ending that matches the verb is replaced.
verb_majority_vote = [
    # Six-letter endings.
    ("iresti", "ire"), ("ireste", "ire"), ("iremmo", "ire"), ("irebbe", "ire"),
    ("iranno", "ire"),
    # Five-letter endings.
    ("ssero", "re"), ("ssimo", "re"), ("ivate", "ire"), ("ivamo", "ire"),
    ("irete", "ire"), ("iremo", "ire"), ("irono", "ire"), ("scano", "re"),
    ("hiamo", "are"), ("scono", "re"), ("hiate", "are"),
    # Four-letter endings.
    ("vano", "re"), ("vate", "re"), ("vamo", "re"), ("simo", "e"), ("rono", "re"),
    ("isse", "ire"), ("isti", "ire"), ("tino", "tare"), ("tato", "tare"),
    ("irai", "ire"), ("tavo", "tare"), ("tavi", "tare"), ("tava", "tare"),
    ("tate", "tare"), ("iste", "ire"), ("irei", "ire"), ("immo", "ire"),
    ("rerò", "rare"), ("rerà", "rare"), ("iavo", "iare"), ("iavi", "iare"),
    ("iava", "iare"), ("iato", "iare"), ("iare", "iare"), ("hino", "are"),
    # Three-letter endings.
    ("ssi", "re"), ("sse", "re"), ("ndo", "re"), ("irò", "ire"), ("tai", "tare"),
    ("ite", "ire"), ("irà", "ire"), ("sco", "re"), ("sca", "re"), ("iai", "iare"),
    # Two-letter endings.
    ("ii", "ire"), ("hi", "are"),
]
|
|
|
|
|
|
|
|
|
class Verbs(_Verbs):
    """ Italian verbs. The path of the lexicon (it-verbs.txt) and its column
        format are handed to the base class; find_lemma() and find_lexeme()
        are rule-based guesses that work on the spelling of the verb only.
    """

    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "it-verbs.txt"),
            language = "it",
             default = {},
              format = [
                0, 1, 2, 3, 4, 5, 6, 8,       # indicativo presente
                34, 35, 36, 37, 38, 39, 24,   # indicativo passato remoto
                17, 18, 19, 20, 21, 22,       # indicativo imperfetto
                40, 41, 42, 43, 44, 45,       # indicativo futuro semplice
                46, 47, 48, 49, 50, 51,       # condizionale presente
                52, 521, 53, 54, 541,         # imperativo
                55, 56, 57, 58, 59, 60,       # congiuntivo presente
                67, 68, 69, 70, 71, 72        # congiuntivo imperfetto
            ])

    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
            The result is lowercase.
        """
        v = verb.lower()
        # Probably infinitive if ends in -are, -ere, -ire or reflexive -rsi.
        if v.endswith(("are", "ere", "ire", "rsi")):
            return v
        # Ruleset adds 3% accuracy.
        for a, b in verb_majority_vote:
            if v.endswith(a):
                return v[:-len(a)] + b
        # Rewrite ch/gh spellings before the endings are stripped
        # (the replacements apply anywhere in the word, in this order).
        for a, b in (
          ("cha", "ca"), ("che", "ce"),
          ("gha", "ga"), ("ghe", "ge"), ("ghi", "gi"), ("gge", "ggie")):
            v = v.replace(a, b)
        # Many verbs end in -ire and have a regular inflection:
        for x in (
          "irò", "irai", "irà", "iremo", "irete", "iranno",            # future
          "irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero",  # conditional
          "ascano",                                                    # subjunctive I
          "issi", "isse", "issimo", "iste", "issero",                  # subjunctive II
          "ivo", "ivi", "iva", "ivamo", "ivate", "ivano",              # past imperfective
          "isti", "immo", "iste", "irono", "ito",                      # past perfective
          "isco", "isci", "isce", "ite", "iscono", "indo"):            # present
            if v.endswith(x):
                return v[:-len(x)] + "ire"
        # Many verbs end in -are and have a regular inflection:
        for x in (
          "erò", "erai", "erà", "eremo", "erete", "eranno",            # future
          "erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero",  # conditional
          "iamo", "iate", "ino",                                       # subjunctive I
          "assi", "asse", "assimo", "aste", "assero",                  # subjunctive II
          "avo", "avi", "ava", "avamo", "avate", "avano",              # past imperfective
          "ai", "asti", "ò", "ammo", "aste", "arono", "ato",           # past perfective
          "iamo", "ate", "ano", "ando"):                               # present
            if v.endswith(x):
                return v[:-len(x)] + "are"
        # Many verbs end in -ere and have a regular inflection:
        for x in (
          "essi", "esse", "essimo", "este", "essero",                  # subjunctive II
          "evo", "evi", "eva", "evamo", "evate", "evano",              # past imperfective
          "ei", "esti", "è", "emmo", "este", "erono", "eto",           # past perfective
          "ete", "ono", "endo"):                                       # present
            if v.endswith(x):
                return v[:-len(x)] + "ere"
        # Last resort: guess from the final letter.
        if v.endswith("à"):
            return v[:-1] + "e"
        if v.endswith("ì"):
            return v[:-1] + "ire"
        if v.endswith("e"):
            return v[:-1] + "ere"
        if v.endswith(("a", "i", "o")):
            return v[:-1] + "are"
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
            The result is a list of lowercase strings that starts with the infinitive;
            its rows have the same lengths as the rows of the format list in __init__().
        """
        infinitive = verb.lower()
        # Reduce -rci, -rsi and -rre infinitives to a plain -re ending.
        v = re.sub(r"rci$", "re", infinitive)
        v = re.sub(r"rsi$", "re", v)
        v = re.sub(r"rre$", "re", v)
        b = v[:-3]
        # The stem adjustments test the lowercase infinitive, like every other
        # step here, so that "PAGARE" is handled the same as "pagare".
        if infinitive.endswith(("care", "gare")):
            b += "h"    # moltiplicare => tu moltiplichi
        if infinitive.endswith(("ciare", "giare")):
            b = b[:-1]  # cominciare => tu cominci
        if v.endswith("are"):
            # -are = 1st conjugation
            a1, a2, a3, a4, a5, a6, a7 = "a", "a", "ò", "a", "i", "e", "a"
        elif v.endswith("ire"):
            # -ire = 3rd conjugation
            a1, a2, a3, a4, a5, a6, a7 = "i", "o", "i", "i", "a", "i", "e"
        else:
            # -ere = 2nd conjugation
            # -orre, -urre = use 2nd conjugation
            a1, a2, a3, a4, a5, a6, a7 = "e", "o", "è", "i", "a", "e", "e"
        # -ire verbs can add -isc between the root and declination.
        isc = "isc" if infinitive.endswith("ire") else ""
        forms = [infinitive,
            b + isc + "o", b + isc + "i", b + isc + a7, b + "iamo", b + a1 + "te", b + isc + a2 + "no", b + a1 + "ndo",
            b + a1 + "i", b + a1 + "sti", b + a3, b + a1 + "mmo", b + a1 + "ste", b + a1 + "rono", b + a1 + "to",
            b + a1 + "vo", b + a1 + "vi", b + a1 + "va", b + a1 + "vamo", b + a1 + "vate", b + a1 + "vano",
            b + a6 + "rò", b + a6 + "rai", b + a6 + "rà", b + a6 + "remo", b + a6 + "rete", b + a6 + "ranno",
            b + a6 + "rei", b + a6 + "resti", b + a6 + "rebbe", b + a6 + "remmo", b + a6 + "reste", b + a6 + "rebbero",
            b + isc + a4, b + isc + a5, b + "iamo", b + a1 + "te", b + isc + a5 + "no",
            b + isc + a5, b + isc + a5, b + isc + a5, b + "iamo", b + "iate", b + isc + a5 + "no",
            b + a1 + "ssi", b + a1 + "ssi", b + a1 + "sse", b + a1 + "ssimo", b + a1 + "ste", b + a1 + "ssero"
        ]
        # Spelling clean-up, applied to every form in this order.
        # The h added to -care/-gare stems is only kept before e and i:
        # "gho" and "ghò" are undone like "cho" and "chò" (pagho => pago).
        cleanup = (
            ("ii", "i"),
            ("cha", "ca"),
            ("gha", "ga"),
            ("gga", "ggia"),
            ("cho", "co"),
            ("chò", "cò"),
            ("gho", "go"),
            ("ghò", "gò"),
        )
        for i, x in enumerate(forms):
            for a, b2 in cleanup:
                x = x.replace(a, b2)
            forms[i] = x
        return forms
|
|
|
|
|
|
# One shared Verbs instance; its methods are exposed as module-level functions.
verbs = Verbs()

conjugate = verbs.conjugate
lemma = verbs.lemma
lexeme = verbs.lexeme
tenses = verbs.tenses
|
|
|
|
|
|
#### ATTRIBUTIVE & PREDICATIVE #####################################################################
|
|
|
|
|
|
# Adjective forms that predicative() looks up directly => predicative form.
adjective_predicative = {
    # bello
    "bei"    : "bello",
    "bel"    : "bello",
    "bell'"  : "bello",
    "begli"  : "bello",
    # buono
    "buon"   : "buono",
    "buon'"  : "buona",
    # grande
    "gran"   : "grande",
    "grand'" : "grande",
    "grandi" : "grande",
    # santo
    "san"    : "santo",
    "sant'"  : "santa",
}
|
|
|
|
|
|
|
|
|
def attributive(adjective):
    """ For a predicative adjective, returns the attributive form.
        Not available for Italian: always raises NotImplementedError.
    """
    # Must deal with feminine and plural.
    raise NotImplementedError()
|
|
|
|
|
|
|
|
|
def predicative(adjective):
    """ Returns the predicative adjective.
        Known forms are looked up in adjective_predicative, other forms are
        rewritten by their ending (the result is then lowercase).
        An adjective that matches nothing is returned unchanged.
    """
    w = adjective.lower()
    try:
        return adjective_predicative[w]
    except KeyError:
        pass
    # (endings, number of characters to drop, ending to add),
    # tried from top to bottom.
    rules = (
        (("ari",), 0, "o"),
        (("ali", "ili", "esi", "nti", "ori"), 1, "e"),
        (("isti",), 1, "a"),
        (("che", "ghe"), 2, "a"),
        (("chi", "ghi"), 2, "o"),
        (("i",), 1, "o"),
        (("e",), 1, "a"),
    )
    for endings, cut, suffix in rules:
        if w.endswith(endings):
            return w[:len(w) - cut] + suffix
    return adjective
|