bo-graduation/nltk-book/pattern-master/pattern/text/es/inflect.py

#### PATTERN | ES | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2012 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).

####################################################################################################
# Regular expressions-based rules for Spanish word inflection:
# - pluralization and singularization of nouns,
# - conjugation of verbs,
# - predicative adjectives.

# Accuracy:
# 78% for pluralize()
# 94% for singularize()
# 81% for Verbs.find_lemma() (0.55 regular 87% + 0.45 irregular 74%)
# 87% for Verbs.find_lexeme() (0.55 regular 99% + 0.45 irregular 72%)
# 93% for predicative()

from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

import os
import sys
import re

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

from pattern.text import Verbs as _Verbs
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE, CONDITIONAL,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
    IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
    IMPERFECT, PRETERITE,
    PARTICIPLE, GERUND
)

sys.path.pop(0)

VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

VOWELS = ("a", "e", "i", "o", "u")
re_vowel = re.compile(r"a|e|i|o|u", re.I)
is_vowel = lambda ch: ch in VOWELS


def normalize(vowel):
    return {"á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u"}.get(vowel, vowel)

#### ARTICLE #######################################################################################
# Spanish inflection of depends on gender and number.

# Inflection gender.
MASCULINE, FEMININE, NEUTER, PLURAL = \
    MALE, FEMALE, NEUTRAL, PLURAL = \
        M, F, N, PL = "m", "f", "n", "p"


def definite_article(word, gender=MALE):
    """ Returns the definite article (el/la/los/las) for a given word.
    """
    if MASCULINE in gender:
        return PLURAL in gender and "los" or "el"
    return PLURAL in gender and "las" or "la"


def indefinite_article(word, gender=MALE):
    """ Returns the indefinite article (un/una/unos/unas) for a given word.
    """
    if MASCULINE in gender:
        return PLURAL in gender and "unos" or "un"
    return PLURAL in gender and "unas" or "una"

DEFINITE = "definite"
INDEFINITE = "indefinite"


def article(word, function=INDEFINITE, gender=MALE):
    """ Returns the indefinite (un) or definite (el) article for the given word.
    """
    return function == DEFINITE \
       and definite_article(word, gender) \
        or indefinite_article(word, gender)
_article = article


def referenced(word, article=INDEFINITE, gender=MALE):
    """ Returns a string with the article + the word.
    """
    return "%s %s" % (_article(word, article, gender), word)

#### PLURALIZE #####################################################################################

plural_irregular = {
     "mamá": "mamás",
     "papá": "papás",
     "sofá": "sofás",
   "dominó": "dominós",
}


def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural of a given word.
        For example: gato => gatos.
        The custom dictionary is for user-defined replacements.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    # Article: masculine el => los, feminine la => las.
    if w == "el":
        return "los"
    if w == "la":
        return "las"
    # Irregular inflections.
    if w in plural_irregular:
        return plural_irregular[w]
    # Words endings that are unlikely to inflect.
    if w.endswith((
      "idad",
      "esis", "isis", "osis",
      "dica", "grafía", "logía")):
        return w
    # Words ending in a vowel get -s: gato => gatos.
    if w.endswith(VOWELS) or w.endswith("é"):
        return w + "s"
    # Words ending in a stressed vowel get -s: hindú => hindúes.
    if w.endswith(("á", "é", "í", "ó", "ú")):
        return w + "es"
    # Words ending in -és get -eses: holandés => holandeses.
    if w.endswith("és"):
        return w[:-2] + "eses"
    # Words ending in -s preceded by an unstressed vowel: gafas => gafas.
    if w.endswith("s") and len(w) > 3 and is_vowel(w[-2]):
        return w
    # Words ending in -z get -ces: luz => luces
    if w.endswith("z"):
        return w[:-1] + "ces"
    # Words that change vowel stress: graduación => graduaciones.
    for a, b in (
      ("án", "anes"),
      ("én", "enes"),
      ("ín", "ines"),
      ("ón", "ones"),
      ("ún", "unes")):
        if w.endswith(a):
            return w[:-2] + b
    # Words ending in a consonant get -es.
    return w + "es"

#print(pluralize("libro"))  # libros
#print(pluralize("señor"))  # señores
#print(pluralize("ley"))    # leyes
#print(pluralize("mes"))    # meses
#print(pluralize("luz"))    # luces
#print(pluralize("inglés")) # ingleses
#print(pluralize("rubí"))   # rubíes
#print(pluralize("papá"))   # papás

#### SINGULARIZE ###################################################################################


def singularize(word, pos=NOUN, custom={}):
    if word in custom:
        return custom[word]
    w = word.lower()
    # los gatos => el gato
    if pos == "DT":
        if w in ("la", "las", "los"):
            return "el"
        if w in ("una", "unas", "unos"):
            return "un"
        return w
    # hombres => hombre
    if w.endswith("es") and w[:-2].endswith(("br", "i", "j", "t", "zn")):
        return w[:-1]
    # gestiones => gestión
    for a, b in (
      ("anes", "án"),
      ("enes", "én"),
      ("eses", "és"),
      ("ines", "ín"),
      ("ones", "ón"),
      ("unes", "ún")):
        if w.endswith(a):
            return w[:-4] + b
    # hipotesis => hipothesis
    if w.endswith(("esis", "isis", "osis")):
        return w
    # luces => luz
    if w.endswith("ces"):
        return w[:-3] + "z"
    # hospitales => hospital
    if w.endswith("es"):
        return w[:-2]
    # gatos => gato
    if w.endswith("s"):
        return w[:-1]
    return w

#### VERB CONJUGATION ##############################################################################

verb_irregular_inflections = [
    ( "yéramos", "ir"   ), ( "cisteis", "cer"   ), ( "tuviera", "tener"), ( "ndieron", "nder" ),
    ( "ndiendo", "nder" ), ( "tándose", "tarse" ), ( "ndieran", "nder" ), ( "ndieras", "nder" ),
    ( "izaréis", "izar" ), ( "disteis", "der"   ), ( "irtiera", "ertir"), ( "pusiera", "poner"),
    ( "endiste", "ender"), ( "laremos", "lar"   ), ( "ndíamos", "nder" ), ( "icaréis", "icar" ),
    ( "dábamos", "dar"  ), ( "intiera", "entir" ), ( "iquemos", "icar" ), ( "jéramos", "cir"  ),
    ( "dierais", "der"  ), ( "endiera", "ender" ), ( "iéndose", "erse" ), ( "jisteis", "cir"  ),
    ( "cierais", "cer"  ), ( "ecíamos", "ecer"  ), ( "áramos", "ar"    ), ( "ríamos", "r"     ),
    ( "éramos", "r"     ), ( "iríais", "ir"     ), (   "temos", "tar"  ), (   "steis", "r"    ),
    (   "ciera", "cer"  ), (   "erais", "r"     ), (   "timos", "tir"  ), (   "uemos", "ar"   ),
    (   "tiera", "tir"  ), (   "bimos", "bir"   ), (  "ciéis", "ciar"  ), (   "gimos", "gir"  ),
    (   "jiste", "cir"  ), (   "mimos", "mir"   ), (  "guéis", "gar"   ), (  "stéis", "star"  ),
    (   "jimos", "cir"  ), (  "inéis", "inar"   ), (   "jemos", "jar"  ), (   "tenga", "tener"),
    (  "quéis", "car"   ), (  "bíais", "bir"    ), (   "jeron", "cir"  ), (  "uíais", "uir"   ),
    (  "ntéis", "ntar"  ), (   "jeras", "cir"   ), (   "jeran", "cir"  ), (  "ducía", "ducir" ),
    (   "yendo", "ir"   ), (   "eemos", "ear"   ), (   "ierta", "ertir"), (   "ierte", "ertir"),
    (   "nemos", "nar"  ), (  "ngáis", "ner"    ), (   "liera", "ler"  ), (  "endió", "ender" ),
    (  "uyáis", "uir"   ), (   "memos", "mar"   ), (   "ciste", "cer"  ), (   "ujera", "ucir" ),
    (   "uimos", "uir"  ), (   "ienda", "ender" ), (  "lléis", "llar"  ), (   "iemos", "iar"  ),
    (   "iende", "ender"), (   "rimos", "rir"   ), (   "semos", "sar"  ), (  "itéis", "itar"  ),
    (  "gíais", "gir"   ), (  "ndáis", "nder"   ), (  "tíais", "tir"   ), (   "demos", "dar"  ),
    (   "lemos", "lar"  ), (   "ponga", "poner" ), (   "yamos", "ir"   ), (  "icéis", "izar"  ),
    (    "bais", "r"    ), (   "rías", "r"      ), (   "rían", "r"     ), (   "iría", "ir"    ),
    (    "eran", "r"    ), (    "eras", "r"     ), (   "irán", "ir"    ), (   "irás", "ir"    ),
    (    "ongo", "oner" ), (    "aiga", "aer"   ), (   "ímos", "ir"    ), (   "ibía", "ibir"  ),
    (    "diga", "decir"), (   "edía", "edir"   ), (    "orte", "ortar"), (   "guió", "guir"  ),
    (    "iega", "egar" ), (    "oren", "orar"  ), (    "ores", "orar" ), (   "léis", "lar"   ),
    (    "irme", "irmar"), (    "siga", "seguir"), (   "séis", "sar"   ), (   "stré", "strar" ),
    (    "cien", "ciar" ), (    "cies", "ciar"  ), (    "dujo", "ducir"), (    "eses", "esar" ),
    (    "esen", "esar" ), (    "coja", "coger" ), (    "lice", "lizar"), (   "tías", "tir"   ),
    (   "tían", "tir"   ), (    "pare", "parar" ), (    "gres", "grar" ), (    "gren", "grar" ),
    (    "tuvo", "tener"), (   "uían", "uir"    ), (   "uías", "uir"   ), (    "quen", "car"  ),
    (    "ques", "car"  ), (   "téis", "tar"    ), (    "iero", "erir" ), (    "iere", "erir" ),
    (    "uche", "uchar"), (    "tuve", "tener" ), (    "inen", "inar" ), (    "pire", "pirar"),
    (   "reía", "reir"  ), (    "uste", "ustar" ), (   "ibió", "ibir"  ), (    "duce", "ducir"),
    (    "icen", "izar" ), (    "ices", "izar"  ), (    "ines", "inar" ), (    "ires", "irar" ),
    (    "iren", "irar" ), (    "duje", "ducir" ), (    "ille", "illar"), (    "urre", "urrir"),
    (    "tido", "tir"  ), (   "ndió", "nder"   ), (    "uido", "uir"  ), (    "uces", "ucir" ),
    (    "ucen", "ucir" ), (   "iéis", "iar"    ), (   "eció", "ecer"  ), (   "jéis", "jar"   ),
    (    "erve", "ervar"), (    "uyas", "uir"   ), (    "uyan", "uir"  ), (    "tía", "tir"   ),
    (    "uía", "uir"   ), (     "aos", "arse"  ), (     "gue", "gar"  ), (    "qué", "car"   ),
    (     "que", "car"  ), (     "rse", "rse"   ), (     "ste", "r"    ), (     "era", "r"    ),
    (    "tió", "tir"   ), (     "ine", "inar"  ), (     "ré", "r"     ), (      "ya", "ir"   ),
    (      "ye", "ir"   ), (     "tí", "tir"    ), (     "cé", "zar"   ), (      "ie", "iar"  ),
    (      "id", "ir"   ), (     "ué", "ar"     ),
]


class Verbs(_Verbs):

    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "es-verbs.txt"),
            language = "es",
             default = {},
              format = [
                0, 1, 2, 3, 4, 5, 6, 8,     # indicativo presente
                34, 35, 36, 37, 38, 39, 24, # indicativo pretérito
                17, 18, 19, 20, 21, 22,     # indicativo imperfecto
                40, 41, 42, 43, 44, 45,     # indicativo futuro
                46, 47, 48, 49, 50, 51,     # indicativo condicional
                52, 54,                     # imperativo afirmativo
                55, 56, 57, 58, 59, 60,     # subjuntivo presente
                67, 68, 69, 70, 71, 72      # subjuntivo imperfecto
            ])

    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
        """
        # Spanish has 12,000+ verbs, ending in -ar (85%), -er (8%), -ir (7%).
        # Over 65% of -ar verbs (6500+) have a regular inflection.
        v = verb.lower()
        # Probably ends in -ir if preceding vowel in stem is -i.
        er_ir = lambda b: (len(b) > 2 and b[-2] == "i") and b + "ir" or b + "er"
        # Probably infinitive if ends in -ar, -er or -ir.
        if v.endswith(("ar", "er", "ir")):
            return v
        # Ruleset for irregular inflections adds 10% accuracy.
        for a, b in verb_irregular_inflections:
            if v.endswith(a):
                return v[:-len(a)] + b
        # reconozco => reconocer
        v = v.replace("zco", "ce")
        # reconozcamos => reconocer
        v = v.replace("zca", "ce")
        # reconozcáis => reconocer
        v = v.replace("zcá", "ce")
        # saldrár => saler
        if "ldr" in v:
            return v[:v.index("ldr") + 1] + "er"
        # compondrán => componer
        if "ndr" in v:
            return v[:v.index("ndr") + 1] + "er"
        # Many verbs end in -ar and have a regular inflection:
        for x in ((
          "ando", "ado", "ad",                                # participle
          "aré", "arás", "ará", "aremos", "aréis", "arán", # future
          "aría", "arías", "aríamos", "aríais", "arían",    # conditional
          "aba", "abas", "ábamos", "abais", "aban",         # past imperfective
          "é", "aste", "ó", "asteis", "aron",               # past perfective
          "ara", "aras", "áramos", "arais", "aran")):       # past subjunctive
            if v.endswith(x):
                return v[:-len(x)] + "ar"
        # Many verbs end in -er and have a regular inflection:
        for x in ((
          "iendo", "ido", "ed",                               # participle
          "eré", "erás", "erá", "eremos", "eréis", "erán", # future
          "ería", "erías", "eríamos", "eríais", "erían",    # conditional
          "ía", "ías", "íamos", "íais", "ían",              # past imperfective
          "í", "iste", "ió", "imos", "isteis", "ieron",        # past perfective
          "era", "eras", "éramos", "erais", "eran")):       # past subjunctive
            if v.endswith(x):
                return er_ir(v[:-len(x)])
        # Many verbs end in -ir and have a regular inflection:
        for x in ((
          "iré", "irás", "irá", "iremos", "iréis", "irán", # future
          "iría", "irías", "iríamos", "iríais", "irían")):  # past subjunctive
            if v.endswith(x):
                return v[:-len(x)] + "ir"
        # Present 1sg -o: yo hablo, como, vivo => hablar, comer, vivir.
        if v.endswith("o"):
            return v[:-1] + "ar"
        # Present 2sg, 3sg and 3pl: tú hablas.
        if v.endswith(("as", "a", "an")):
            return v.rstrip("sn")[:-1] + "ar"
        # Present 2sg, 3sg and 3pl: tú comes, tú vives.
        if v.endswith(("es", "e", "en")):
            return er_ir(v.rstrip("sn")[:-1])
        # Present 1pl and 2pl: nosotros hablamos.
        for i, x in enumerate((
          ("amos", "áis"),
          ("emos", "éis"),
          ("imos", "ís"))):
            for x in x:
                if v.endswith(x):
                    return v[:-len(x)] + ("ar", "er", "ir")[i]
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
        """
        v = verb.lower()
        if v.endswith(("arse", "erse", "irse")):
            # Reflexive verbs: calmarse (calmar) => me calmo.
            b = v[:-4]
        else:
            b = v[:-2]
        if v.endswith("ar") or not v.endswith(("er", "ir")):
            # Regular inflection for verbs ending in -ar.
            return [v,
                b + "o", b + "as", b + "a", b + "amos", b + "áis", b + "an", b + "ando",
                b + "é", b + "aste", b + "ó", b + "amos", b + "asteis", b + "aron", b + "ado",
                b + "aba", b + "abas", b + "aba", b + "ábamos", b + "abais", b + "aban",
                v + "é", v + "ás", v + "á", v + "emos", v + "éis", v + "án",
                v + "ía", v + "ías", v + "ía", v + "íamos", v + "íais", v + "ían",
                b + "a", v[:-1] + "d",
                b + "e", b + "es", b + "e", b + "emos", b + "éis", b + "en",
                v + "a", v + "as", v + "a", b + "áramos", v + "ais", v + "an"]
        else:
            # Regular inflection for verbs ending in -er and -ir.
            p1, p2 = v.endswith("er") and ("e", "é") or ("i", "e")
            return [v,
                b + "o", b + "es", b + "e", b + p1 + "mos", b + p2 + "is", b + "en", b + "iendo",
                b + "í", b + "iste", b + "ió", b + "imos", b + "isteis", b + "ieron", b + "ido",
                b + "ía", b + "ías", b + "ía", b + "íamos", b + "íais", b + "ían",
                v + "é", v + "ás", v + "á", v + "emos", v + "éis", v + "án",
                v + "ía", v + "ías", v + "ía", v + "íamos", v + "íais", v + "ían",
                b + "a", v[:-1] + "d",
                b + "a", b + "as", b + "a", b + "amos", b + "áis", b + "an",
                b + "iera", b + "ieras", b + "iera", b + "iéramos", b + "ierais", b + "ieran"]

verbs = Verbs()

conjugate, lemma, lexeme, tenses = \
    verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses

#### ATTRIBUTIVE & PREDICATIVE #####################################################################


def attributive(adjective, gender=MALE):
    w = adjective.lower()
    # normal => normales
    if PLURAL in gender and not is_vowel(w[-1:]):
        return w + "es"
    # el chico inteligente => los chicos inteligentes
    if PLURAL in gender and w.endswith(("a", "e")):
        return w + "s"
    # el chico alto => los chicos altos
    if w.endswith("o"):
        if FEMININE in gender and PLURAL in gender:
            return w[:-1] + "as"
        if FEMININE in gender:
            return w[:-1] + "a"
        if PLURAL in gender:
            return w + "s"
    return w

#print(attributive("intelligente", gender=PLURAL)) # intelligentes
#print(attributive("alto", gender=MALE+PLURAL))    # altos
#print(attributive("alto", gender=FEMALE+PLURAL))  # altas
#print(attributive("normal", gender=MALE))         # normal
#print(attributive("normal", gender=FEMALE))       # normal
#print(attributive("normal", gender=PLURAL))       # normales


def predicative(adjective):
    """ Returns the predicative adjective (lowercase).
        In Spanish, the attributive form is always used for descriptive adjectives:
        "el chico alto" => masculine,
        "la chica alta" => feminine.
        The predicative is useful for lemmatization.
    """
    w = adjective.lower()
    # histéricos => histérico
    if w.endswith(("os", "as")):
        w = w[:-1]
    # histérico => histérico
    if w.endswith("o"):
        return w
    # histérica => histérico
    if w.endswith("a"):
        return w[:-1] + "o"
    # horribles => horrible, humorales => humoral
    if w.endswith("es"):
        if len(w) >= 4 and not is_vowel(normalize(w[-3])) and not is_vowel(normalize(w[-4])):
            return w[:-1]
        return w[:-2]
    return w