You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

322 lines
13 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#### PATTERN | FR | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2013 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
####################################################################################################
# Regular expressions-based rules for French word inflection:
# - pluralization and singularization of nouns,
# - conjugation of verbs,
# - predicative and attributive of adjectives.
# Accuracy:
# 92% for pluralize()
# 93% for singularize()
# 80% for Verbs.find_lemma() (mixed regular/irregular)
# 86% for Verbs.find_lexeme() (mixed regular/irregular)
# 95% predicative() (measured on Lexique French morphology word forms)
from __future__ import unicode_literals
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
import sys
import re
# Directory that contains this module; it is joined with "fr-verbs.txt" below
# to locate the verb lexicon. When the location cannot be resolved (__file__
# is not defined, or the path lookup fails), fall back to "" so that paths
# are resolved relative to the current working directory instead.
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except (NameError, OSError):
    # A bare "except:" would also swallow KeyboardInterrupt and SystemExit.
    MODULE = ""
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
from pattern.text import Verbs as _Verbs
from pattern.text import (
INFINITIVE, PRESENT, PAST, FUTURE,
FIRST, SECOND, THIRD,
SINGULAR, PLURAL, SG, PL,
INDICATIVE, IMPERATIVE, SUBJUNCTIVE, CONDITIONAL,
IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
IMPERFECT, PRETERITE,
PARTICIPLE, GERUND
)
sys.path.pop(0)
# Part-of-speech tags; NOUN is the default `pos` of pluralize() and singularize().
VERB = "VB"
NOUN = "NN"
ADJECTIVE = "JJ"
ADVERB = "RB"

# The five unaccented vowels, lowercase only.
VOWELS = ("a", "e", "i", "o", "u")

# Matches a single unaccented vowel, ignoring case.
re_vowel = re.compile(r"a|e|i|o|u", re.I)


def is_vowel(ch):
    """ Returns True if the given string is one of VOWELS (case-sensitive).
    """
    return ch in VOWELS
#### PLURALIZE #####################################################################################
# Plurals that pluralize() looks up before applying its suffix rules.
plural_irregular = {
    # Nouns in -eu that take -s (the suffix rule would append -x):
    "bleu": "bleus",
    "pneu": "pneus",
    # Nouns in -ail that change to -aux (the default rule would append -s):
    "travail": "travaux",
    "vitrail": "vitraux",
}
def pluralize(word, pos=NOUN, custom={}):
    """ Returns the plural form of the given word, in lowercase.
        A word found in the custom dictionary is returned as mapped there
        (this lookup is case-sensitive and happens before anything else).
        The pos argument is accepted but not consulted by the rules below.
    """
    if word in custom:
        return custom[word]
    lowered = word.lower()
    if lowered in plural_irregular:
        return plural_irregular[lowered]
    # Ordered rules: (suffixes, number of trailing characters to drop, ending to add).
    # The first rule whose suffix matches wins.
    rules = (
        (("ais", "ois"), 0, "es"),
        (("s", "x"), 0, ""),      # Already ends in -s or -x: unchanged.
        (("al",), 2, "aux"),      # -al => -aux
        (("au", "eu"), 0, "x"),   # -au, -eu => -aux, -eux
    )
    for suffixes, drop, ending in rules:
        if lowered.endswith(suffixes):
            return lowered[:len(lowered) - drop] + ending
    # Default: append -s.
    return lowered + "s"
#### SINGULARIZE ###################################################################################
# Plural (or contracted) articles, determiners and possessives => singular form.
_singular_function_words = {
    "du": "de",
    "ces": "ce",
    "les": "le",
    "des": "un",
    "mes": "mon",
    "ses": "son",
    "tes": "ton",
    "nos": "notre",
    "vos": "votre",
}


def singularize(word, pos=NOUN, custom={}):
    """ Returns the singular of a given word, in lowercase.
        The custom dictionary is for user-defined replacements
        (case-sensitive lookup, applied before any rule).
        If pos is one of DT, PRP, PRP$, WP, RB, IN, common determiners are mapped
        to their singular form (les => le) and a word ending in an apostrophe is
        expanded with -e (l' => le); otherwise the noun suffix rules are applied.
    """
    if word in custom:
        return custom[word]
    w = word.lower()
    # Common articles, determiners, pronouns:
    if pos in ("DT", "PRP", "PRP$", "WP", "RB", "IN"):
        if w in _singular_function_words:
            return _singular_function_words[w]
        # Elided form: match the straight and the typographic apostrophe.
        # The suffix must not be empty: every string ends with "", which would
        # turn any other word with one of these tags into w[:-1] + "e".
        if w.endswith(("'", "\u2019")):
            return w[:-1] + "e"
    if w.endswith("nnes"):  # parisiennes => parisien
        return w[:-3]
    if w.endswith("ntes"):  # passantes => passant
        return w[:-2]
    if w.endswith("euses"):  # danseuses => danseur
        return w[:-3] + "r"
    if w.endswith("s"):
        return w[:-1]
    if w.endswith(("aux", "eux", "oux")):
        # Only the final -x is dropped.
        return w[:-1]
    if w.endswith("ii"):
        return w[:-1] + "o"
    if w.endswith(("ia", "ma")):
        return w[:-1] + "um"
    if "-" in w:
        # Hyphenated compound: singularize the first component only
        # (with the default pos and no custom dictionary) and keep the rest.
        head, _, tail = w.partition("-")
        return singularize(head) + "-" + tail
    return w
#### VERB CONJUGATION ##############################################################################
# Ordered (inflected suffix, infinitive ending) pairs used by Verbs.find_lemma().
# find_lemma() walks the list from the top and rewrites the first suffix that
# matches the end of the verb, so the position of an entry is significant:
# entries are listed from the longest suffix (8 characters) down to the
# single-letter suffixes, which act as last-resort guesses.
verb_inflections = [
("issaient", "ir" ), ("eassions", "er" ), ("dissions", "dre" ), ("çassions", "cer" ),
( "eraient", "er" ), ( "assions", "er" ), ( "issions", "ir" ), ( "iraient", "ir" ),
( "isaient", "ire" ), ( "geaient", "ger" ), ( "eassent", "er" ), ( "geasses", "ger" ),
( "eassiez", "er" ), ( "dissiez", "dre" ), ( "dissent", "dre" ), ( "endrons", "endre"),
( "endriez", "endre"), ( "endrais", "endre"), ( "erions", "er" ), ( "assent", "er" ),
( "assiez", "er" ), ( "raient", "re" ), ( "issent", "ir" ), ( "issiez", "ir" ),
( "irions", "ir" ), ( "issons", "ir" ), ( "issant", "ir" ), ( "issait", "ir" ),
( "issais", "ir" ), ( "aient", "er" ), ( "èrent", "er" ), ( "erait", "er" ),
( "eront", "er" ), ( "erons", "er" ), ( "eriez", "er" ), ( "erais", "er" ),
( "asses", "er" ), ( "rions", "re" ), ( "isses", "ir" ), ( "irent", "ir" ),
( "irait", "ir" ), ( "irons", "ir" ), ( "iriez", "ir" ), ( "irais", "ir" ),
( "iront", "ir" ), ( "issez", "ir" ), ( "ions", "er" ), ( "erez", "er" ),
( "eras", "er" ), ( "erai", "er" ), ( "asse", "er" ), ( "âtes", "er" ),
( "âmes", "er" ), ( "isse", "ir" ), ( "îtes", "ir" ), ( "îmes", "ir" ),
( "irez", "ir" ), ( "iras", "ir" ), ( "irai", "ir" ), ( "ront", "re" ),
( "iez", "er" ), ( "ent", "er" ), ( "ais", "er" ), ( "ons", "er" ),
( "ait", "er" ), ( "ant", "er" ), ( "era", "er" ), ( "ira", "ir" ),
( "es", "er" ), ( "ez", "er" ), ( "as", "er" ), ( "ai", "er" ),
( "ât", "er" ), ( "ds", "dre" ), ( "is", "ir" ), ( "it", "ir" ),
( "ît", "ir" ), ( "ïr", "ïr" ), ( "nd", "ndre"), ( "nu", "nir" ),
( "e", "er" ), ( "é", "er" ), ( "a", "er" ), ( "t", "re" ),
( "s", "re" ), ( "i", "ir" ), ( "û", "ir" ), ( "u", "re" ),
( "d", "dre" )
]
class Verbs(_Verbs):
    """ French verbs: a lexicon loaded from fr-verbs.txt, plus rule-based guesses
        for the lemma (find_lemma) and the inflected forms (find_lexeme) of a verb.
        NOTE(review): presumably the base class falls back on these two methods for
        verbs that are not in the lexicon - confirm in pattern.text.Verbs.
    """

    def __init__(self):
        _Verbs.__init__(self, os.path.join(MODULE, "fr-verbs.txt"),
            language="fr",
            default={},
            # One index per column of the lexicon file, grouped by tense;
            # find_lexeme() returns its forms in this same order.
            format=[
                0, 1, 2, 3, 4, 5, 6, 8, 24,  # present indicative
                34, 35, 36, 37, 38, 39,      # simple past indicative (passé simple)
                17, 18, 19, 20, 21, 22,      # imperfect indicative
                40, 41, 42, 43, 44, 45,      # simple future indicative
                46, 47, 48, 49, 50, 51,      # present conditional
                52, 53, 54,                  # present imperative
                55, 56, 57, 58, 59, 60,      # present subjunctive
                67, 68, 69, 70, 71, 72       # imperfect subjunctive
            ])

    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
            A verb that already ends in -er, -ir or -re is returned as-is (lowercase).
            Otherwise the first matching suffix in verb_inflections is replaced by its
            infinitive ending; if nothing matches, the lowercase verb is returned.
        """
        # French has 20,000+ verbs, ending in -er (majority), -ir, -re.
        v = verb.lower()
        if v.endswith(("er", "ir", "re")):
            return v
        for suffix, ending in verb_inflections:
            if v.endswith(suffix):
                return v[:-len(suffix)] + ending
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
            The result is a list of 48 strings: the lowercase infinitive followed by
            the inflected forms, one row per tense in the order of the format table
            given to the constructor.
        """
        v = verb.lower()
        b = v[:-2]  # Stem: the infinitive without its two-letter ending.
        if v.endswith("ir") and not \
           v.endswith(("couvrir", "cueillir", "découvrir", "offrir", "ouvrir", "souffrir")):
            # Regular inflection for verbs ending in -ir.
            # Some -ir verbs drop the last letter of the stem: dormir => je dors (not: je dormis).
            if v.endswith(("dormir", "mentir", "partir", "sentir", "servir", "sortir")):
                b0 = b[:-1]
            else:
                b0 = b + "i"
            return [v,
                # present indicative + present participle + past participle
                b0 + "s", b0 + "s", b0 + "t", b + "issons", b + "issez", b + "issent", b + "issant", b + "i",
                # simple past
                b + "is", b + "is", b + "it", b + "îmes", b + "îtes", b + "irent",
                # imperfect
                b + "issais", b + "issais", b + "issait", b + "issions", b + "issiez", b + "issaient",
                # simple future
                v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont",
                # present conditional
                v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient",
                # present imperative
                b + "is", b + "issons", b + "issez",
                # present subjunctive
                b + "isse", b + "isses", b + "isse", b + "issions", b + "issiez", b + "issent",
                # imperfect subjunctive
                b + "isse", b + "isses", b + "ît", b + "issions", b + "issiez", b + "issent"
            ]
        elif v.endswith("re"):
            # Regular inflection for verbs ending in -re.
            # b0 is the stem of the singular present forms, b1 the stem of the plural forms.
            if v.endswith("ttre"):
                # Verbs ending in -attre and -ettre drop the -t in the singular form only:
                # battre => je bats, nous battons (not: nous batons).
                b0, b1 = b[:-1], b
            else:
                b0 = b1 = b
            # Verbs ending in -aindre, -eindre and -oindre drop the -d.
            if v.endswith("indre"):
                b0, b1 = b[:-1], b[:-2] + "gn"
            # Verbs ending in -prendre drop the -d in the plural form.
            if v.endswith("prendre"):
                b0, b1 = b, b[:-1]
            return [v,
                # present indicative + present participle + past participle
                b0 + "s", b0 + "s", b0 + "", b1 + "ons", b1 + "ez", b1 + "ent", b1 + "ant", b + "u",
                # simple past
                b + "is", b + "is", b + "it", b1 + "îmes", b1 + "îtes", b1 + "irent",
                # imperfect
                b + "ais", b + "ais", b + "ait", b1 + "ions", b1 + "iez", b1 + "aient",
                # simple future
                b + "rai", b + "ras", b + "ra", b + "rons", b + "rez", b + "ront",
                # present conditional: future stem + imperfect endings
                # (vendre => je vendrais; this row must not repeat the imperfect).
                b + "rais", b + "rais", b + "rait", b + "rions", b + "riez", b + "raient",
                # present imperative
                b0 + "s", b1 + "ons", b1 + "ez",
                # present subjunctive
                b + "e", b + "es", b + "e", b1 + "ions", b1 + "iez", b1 + "ent",
                # imperfect subjunctive
                b + "isse", b + "isses", b + "ît", b1 + "issions", b1 + "issiez", b1 + "issent"
            ]
        else:
            # Regular inflection for verbs ending in -er.
            # This branch is also reached by the -ir verbs excluded above (ouvrir => j'ouvre).
            # If the stem ends in -g, use -ge before hard vowels -a and -o: manger => mangeons.
            # If the stem ends in -c, use -ç before hard vowels -a and -o: lancer => lançons.
            e = "e" if v.endswith("ger") else ""
            c = b[:-1] + "ç" if v.endswith("cer") else b
            h = c + e  # Stem to use before endings that start with -a or -o.
            return [v,
                # present indicative + present participle + past participle
                b + "e", b + "es", b + "e", h + "ons", b + "ez", b + "ent", h + "ant", b + "é",
                # simple past
                h + "ai", h + "as", h + "a", h + "âmes", h + "âtes", b + "èrent",
                # imperfect
                h + "ais", h + "ais", h + "ait", b + "ions", b + "iez", h + "aient",
                # simple future
                v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont",
                # present conditional
                v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient",
                # present imperative
                b + "e", h + "ons", b + "ez",
                # present subjunctive
                b + "e", b + "es", b + "e", b + "ions", b + "iez", b + "ent",
                # imperfect subjunctive
                h + "asse", h + "asses", h + "ât", h + "assions", h + "assiez", h + "assent"
            ]
# Shared Verbs instance, with module-level shortcuts to four of its attributes.
verbs = Verbs()
conjugate = verbs.conjugate
lemma = verbs.lemma
lexeme = verbs.lexeme
tenses = verbs.tenses
#### ATTRIBUTIVE & PREDICATIVE #####################################################################
def attributive(adjective):
    """ Placeholder for the attributive form of a predicative adjective.
        Always raises NotImplementedError.
    """
    # An implementation has to handle both feminine and plural forms.
    raise NotImplementedError()
# Ordered (suffix, replacement) rules for predicative(). The first suffix that
# matches the end of the word is replaced, so more specific endings must come
# before the general ones that would also match (-eaux before -aux, -rteuse
# before -euse, -ie before -ise, -enne/-onne before -ne, ...).
_predicative_rules = (
    ("ais", "ais"), ("ois", "ois"),                      # Unchanged.
    ("és", "é"), ("ée", "é"), ("ées", "é"),
    ("que", "que"), ("ques", "que"),
    ("nts", "nt"), ("nte", "nt"), ("ntes", "nt"),
    ("eaux", "eau"),
    ("aux", "al"), ("ale", "al"), ("ales", "al"),
    ("rteuse", "rteur"), ("rteuses", "rteur"), ("ailleuse", "ailleur"),
    ("euse", "eux"), ("euses", "eux"),
    ("els", "el"), ("elle", "el"), ("elles", "el"),
    ("ifs", "if"), ("ive", "if"), ("ives", "if"),
    ("is", "i"), ("ie", "i"), ("ies", "i"),
    ("enne", "en"), ("ennes", "en"),
    ("onne", "on"), ("onnes", "on"),
    ("igne", "igne"), ("ignes", "igne"), ("ingue", "ingue"), ("ingues", "ingue"),
    ("ène", "ène"), ("ènes", "ène"),
    ("ns", "n"), ("ne", "n"), ("nes", "n"),
    ("ite", "it"), ("ites", "it"),
    ("ise", "is"), ("ises", "is"),                       # -is itself is taken by the -is/-ie/-ies rule.
    ("rice", "eur"), ("rices", "eur"),
    ("iers", "ier"), ("ière", "ier"), ("ières", "ier"),
    ("ette", "et"), ("ettes", "et"),
    ("rds", "rd"), ("rde", "rd"), ("rdes", "rd"),
    ("nds", "nd"), ("nde", "nd"), ("ndes", "nd"),
    ("us", "u"), ("ue", "u"), ("ues", "u"),
)


def predicative(adjective):
    """ Returns the predicative adjective (lowercase): heureuses => heureux.
        The first matching suffix rule rewrites the feminine and/or plural ending;
        if no rule matches, trailing -s is stripped.
    """
    w = adjective.lower()
    for suffix, replacement in _predicative_rules:
        if w.endswith(suffix):
            # Replace exactly the matched suffix. str.rstrip() takes a set of
            # characters, not a suffix, and can eat into the stem
            # (universelle => univerel instead of universel).
            return w[:-len(suffix)] + replacement
    return w.rstrip("s")