bo-graduation/nltk-book/pattern-master/pattern/text/fr/inflect.py

#### PATTERN | FR | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2013 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).

####################################################################################################
# Regular expressions-based rules for French word inflection:
# - pluralization and singularization of nouns,
# - conjugation of verbs,
# - predicative and attributive of adjectives.

# Accuracy:
# 92% for pluralize()
# 93% for singularize()
# 80% for Verbs.find_lemma() (mixed regular/irregular)
# 86% for Verbs.find_lexeme() (mixed regular/irregular)
# 95% predicative() (measured on Lexique French morphology word forms)

from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

import os
import sys
import re

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

from pattern.text import Verbs as _Verbs
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    INDICATIVE, IMPERATIVE, SUBJUNCTIVE, CONDITIONAL,
    IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
    IMPERFECT, PRETERITE,
    PARTICIPLE, GERUND
)

sys.path.pop(0)

VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

VOWELS = ("a", "e", "i", "o", "u")
re_vowel = re.compile(r"a|e|i|o|u", re.I)
is_vowel = lambda ch: ch in VOWELS

#### PLURALIZE #####################################################################################

plural_irregular = {
       "bleu": "bleus",
       "pneu": "pneus",
    "travail": "travaux",
    "vitrail": "vitraux"
}


def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural of a given word.
        The custom dictionary is for user-defined replacements.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    if w in plural_irregular:
        return plural_irregular[w]
    if w.endswith(("ais", "ois")):
        return w + "es"
    if w.endswith(("s", "x")):
        return w
    if w.endswith("al"):
        return w[:-2] + "aux"
    if w.endswith(("au", "eu")):
        return w + "x"
    return w + "s"

#### SINGULARIZE ###################################################################################


def singularize(word, pos=NOUN, custom={}):
    if word in custom:
        return custom[word]
    w = word.lower()
    # Common articles, determiners, pronouns:
    if pos in ("DT", "PRP", "PRP$", "WP", "RB", "IN"):
        if w == "du":
            return "de"
        if w == "ces":
            return "ce"
        if w == "les":
            return "le"
        if w == "des":
            return "un"
        if w == "mes":
            return "mon"
        if w == "ses":
            return "son"
        if w == "tes":
            return "ton"
        if w == "nos":
            return "notre"
        if w == "vos":
            return "votre"
        if w.endswith(("'", "’")):
            return w[:-1] + "e"
    if w.endswith("nnes"):  # parisiennes => parisien
        return w[:-3]
    if w.endswith("ntes"):  # passantes => passant
        return w[:-2]
    if w.endswith("euses"): # danseuses => danseur
        return w[:-3] + "r"
    if w.endswith("s"):
        return w[:-1]
    if w.endswith(("aux", "eux", "oux")):
        return w[:-1]
    if w.endswith("ii"):
        return w[:-1] + "o"
    if w.endswith(("ia", "ma")):
        return w[:-1] + "um"
    if "-" in w:
        return singularize(w.split("-")[0]) + "-" + "-".join(w.split("-")[1:])
    return w

#### VERB CONJUGATION ##############################################################################

verb_inflections = [
    ("issaient", "ir"   ), ("eassions", "er"   ), ("dissions", "dre" ), ("çassions", "cer"  ),
    ( "eraient", "er"   ), ( "assions", "er"   ), ( "issions", "ir"  ), (  "iraient", "ir"   ),
    ( "isaient", "ire"  ), ( "geaient", "ger"  ), ( "eassent", "er"  ), (  "geasses", "ger"  ),
    ( "eassiez", "er"   ), ( "dissiez", "dre"  ), ( "dissent", "dre" ), (  "endrons", "endre"),
    ( "endriez", "endre"), ( "endrais", "endre"), (  "erions", "er"  ), (   "assent", "er"   ),
    (  "assiez", "er"   ), (  "raient", "re"   ), (  "issent", "ir"  ), (   "issiez", "ir"   ),
    (  "irions", "ir"   ), (  "issons", "ir"   ), (  "issant", "ir"  ), (   "issait", "ir"   ),
    (  "issais", "ir"   ), (   "aient", "er"   ), (  "èrent", "er"  ), (    "erait", "er"   ),
    (   "eront", "er"   ), (   "erons", "er"   ), (   "eriez", "er"  ), (    "erais", "er"   ),
    (   "asses", "er"   ), (   "rions", "re"   ), (   "isses", "ir"  ), (    "irent", "ir"   ),
    (   "irait", "ir"   ), (   "irons", "ir"   ), (   "iriez", "ir"  ), (    "irais", "ir"   ),
    (   "iront", "ir"   ), (   "issez", "ir"   ), (    "ions", "er"  ), (     "erez", "er"   ),
    (    "eras", "er"   ), (    "erai", "er"   ), (    "asse", "er"  ), (     "âtes", "er"   ),
    (    "âmes", "er"   ), (    "isse", "ir"   ), (    "îtes", "ir"  ), (     "îmes", "ir"   ),
    (    "irez", "ir"   ), (    "iras", "ir"   ), (    "irai", "ir"  ), (     "ront", "re"   ),
    (     "iez", "er"   ), (     "ent", "er"   ), (     "ais", "er"  ), (      "ons", "er"   ),
    (     "ait", "er"   ), (     "ant", "er"   ), (     "era", "er"  ), (      "ira", "ir"   ),
    (      "es", "er"   ), (      "ez", "er"   ), (      "as", "er"  ), (       "ai", "er"   ),
    (      "ât", "er"   ), (      "ds", "dre"  ), (      "is", "ir"  ), (       "it", "ir"   ),
    (      "ît", "ir"   ), (      "ïr", "ïr"   ), (      "nd", "ndre"), (       "nu", "nir"  ),
    (       "e", "er"   ), (       "é", "er"   ), (       "a", "er"  ), (        "t", "re"   ),
    (       "s", "re"   ), (       "i", "ir"   ), (       "û", "ir"  ), (        "u", "re"   ),
    (       "d", "dre"  )
]


class Verbs(_Verbs):

    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "fr-verbs.txt"),
            language = "fr",
             default = {},
              format = [
                0, 1, 2, 3, 4, 5, 6, 8, 24, # indicatif présent
                34, 35, 36, 37, 38, 39,     # indicatif passé simple
                17, 18, 19, 20, 21, 22,     # indicatif imparfait
                40, 41, 42, 43, 44, 45,     # indicatif futur simple
                46, 47, 48, 49, 50, 51,     # conditionnel présent
                52, 53, 54,                 # impératif présent
                55, 56, 57, 58, 59, 60,     # subjonctif présent
                67, 68, 69, 70, 71, 72      # subjonctif imparfait
            ])

    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
        """
        # French has 20,000+ verbs, ending in -er (majority), -ir, -re.
        v = verb.lower()
        if v.endswith(("er", "ir", "re")):
            return v
        for a, b in verb_inflections:
            if v.endswith(a):
                return v[:-len(a)] + b
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
        """
        v = verb.lower()
        b = v[:-2]
        if v.endswith("ir") and not \
           v.endswith(("couvrir", "cueillir", "découvrir", "offrir", "ouvrir", "souffrir")):
            # Regular inflection for verbs ending in -ir.
            # Some -ir verbs drop the last letter of the stem: dormir => je dors (not: je dormis).
            if v.endswith(("dormir", "mentir", "partir", "sentir", "servir", "sortir")):
                b0 = b[:-1]
            else:
                b0 = b + "i"
            return [v,
                b0 + "s", b0 + "s", b0 + "t", b + "issons", b + "issez", b + "issent", b + "issant", b + "i",
                b + "is", b + "is", b + "it", b + "îmes", b + "îtes", b + "irent",
                b + "issais", b + "issais", b + "issait", b + "issions", b + "issiez", b + "issaient",
                v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont",
                v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient",
                b + "is", b + "issons", b + "issez",
                b + "isse", b + "isses", b + "isse", b + "issions", b + "issiez", b + "issent",
                b + "isse", b + "isses", b + "ît", b + "issions", b + "issiez", b + "issent"
            ]
        elif v.endswith("re"):
            # Regular inflection for verbs ending in -re.
            # Verbs ending in -attre and -ettre drop the -t in the singular form.
            if v.endswith(("ttre")):
                b0 = b1 = b[:-1]
            else:
                b0 = b1 = b
            # Verbs ending in -aindre, -eindre and -oindre drop the -d.
            if v.endswith("indre"):
                b0, b1 = b[:-1], b[:-2] + "gn"
            # Verbs ending in -prendre drop the -d in the plural form.
            if v.endswith("prendre"):
                b0, b1 = b, b[:-1]
            return [v,
                b0 + "s", b0 + "s", b0 + "", b1 + "ons", b1 + "ez", b1 + "ent", b1 + "ant", b + "u",
                b + "is", b + "is", b + "it", b1 + "îmes", b1 + "îtes", b1 + "irent",
                b + "ais", b + "ais", b + "ait", b1 + "ions", b1 + "iez", b1 + "aient",
                b + "rai", b + "ras", b + "ra", b + "rons", b + "rez", b + "ront",
                b + "ais", b + "ais", b + "ait", b1 + "ions", b1 + "iez", b1 + "aient",
                b0 + "s", b1 + "ons", b1 + "ez",
                b + "e", b + "es", b + "e", b1 + "ions", b1 + "iez", b1 + "ent",
                b + "isse", b + "isses", b + "ît", b1 + "issions", b1 + "issiez", b1 + "issent"
            ]
        else:
            # Regular inflection for verbs ending in -er.
            # If the stem ends in -g, use -ge before hard vowels -a and -o: manger => mangeons.
            # If the stem ends in -c, use -ç before hard vowels -a and -o: lancer => lançons.
            e = v.endswith("ger") and "e" or ""
            c = v.endswith("cer") and b[:-1] + "ç" or b
            return [v,
                b + "e", b + "es", b + "e", c + e + "ons", b + "ez", b + "ent", c + e + "ant", b + "é",
                c + e + "ai", c + e + "as", c + e + "a", c + e + "âmes", c + e + "âtes", b + "èrent",
                c + e + "ais", c + e + "ais", c + e + "ait", b + "ions", b + "iez", c + e + "aient",
                v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont",
                v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient",
                b + "e", c + e + "ons", b + "ez",
                b + "e", b + "es", b + "e", b + "ions", b + "iez", b + "ent",
                c + e + "asse", c + e + "asses", c + e + "ât", c + e + "assions", c + e + "assiez", c + e + "assent"
            ]

verbs = Verbs()

conjugate, lemma, lexeme, tenses = \
    verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses

#### ATTRIBUTIVE & PREDICATIVE #####################################################################


def attributive(adjective):
    """ For a predicative adjective, returns the attributive form.
    """
    # Must deal with feminine and plural.
    raise NotImplementedError


def predicative(adjective):
    """ Returns the predicative adjective (lowercase): belles => beau.
    """
    w = adjective.lower()
    if w.endswith(("ais", "ois")):
        return w
    if w.endswith(("és", "ée", "ées")):
        return w.rstrip("es")
    if w.endswith(("que", "ques")):
        return w.rstrip("s")
    if w.endswith(("nts", "nte", "ntes")):
        return w.rstrip("es")
    if w.endswith("eaux"):
        return w.rstrip("x")
    if w.endswith(("aux", "ale", "ales")):
        return w.rstrip("uxles") + "l"
    if w.endswith(("rteuse", "rteuses", "ailleuse")):
        return w.rstrip("es") + "r"
    if w.endswith(("euse", "euses")):
        return w.rstrip("es") + "x"
    if w.endswith(("els", "elle", "elles")):
        return w.rstrip("les") + "el"
    if w.endswith(("ifs", "ive", "ives")):
        return w.rstrip("es")[:-2] + "if"
    if w.endswith(("is", "ie", "ies")):
        return w.rstrip("es")
    if w.endswith(("enne", "ennes")):
        return w.rstrip("nes") + "en"
    if w.endswith(("onne", "onnes")):
        return w.rstrip("nes") + "n"
    if w.endswith(("igne", "ignes", "ingue", "ingues")):
        return w.rstrip("s")
    if w.endswith(("ène", "ènes")):
        return w.rstrip("s")
    if w.endswith(("ns", "ne", "nes")):
        return w.rstrip("es")
    if w.endswith(("ite", "ites")):
        return w.rstrip("es")
    if w.endswith(("is", "ise", "ises")):
        return w.rstrip("es") + "s"
    if w.endswith(("rice", "rices")):
        return w.rstrip("rices") + "eur"
    if w.endswith(("iers", "ière", "ières")):
        return w.rstrip("es")[:-3] + "ier"
    if w.endswith(("ette", "ettes")):
        return w.rstrip("tes") + "et"
    if w.endswith(("rds", "rde", "rdes")):
        return w.rstrip("es")
    if w.endswith(("nds", "nde", "ndes")):
        return w.rstrip("es")
    if w.endswith(("us", "ue", "ues")):
        return w.rstrip("es")
    return w.rstrip("s")