#### PATTERN | FR | INFLECT ######################################################################## # -*- coding: utf-8 -*- # Copyright (c) 2013 University of Antwerp, Belgium # Author: Tom De Smedt # License: BSD (see LICENSE.txt for details). #################################################################################################### # Regular expressions-based rules for French word inflection: # - pluralization and singularization of nouns, # - conjugation of verbs, # - predicative and attributive of adjectives. # Accuracy: # 92% for pluralize() # 93% for singularize() # 80% for Verbs.find_lemma() (mixed regular/irregular) # 86% for Verbs.find_lexeme() (mixed regular/irregular) # 95% predicative() (measured on Lexique French morphology word forms) from __future__ import unicode_literals from __future__ import division from builtins import str, bytes, dict, int from builtins import map, zip, filter from builtins import object, range import os import sys import re try: MODULE = os.path.dirname(os.path.realpath(__file__)) except: MODULE = "" sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", "..")) from pattern.text import Verbs as _Verbs from pattern.text import ( INFINITIVE, PRESENT, PAST, FUTURE, FIRST, SECOND, THIRD, SINGULAR, PLURAL, SG, PL, INDICATIVE, IMPERATIVE, SUBJUNCTIVE, CONDITIONAL, IMPERFECTIVE, PERFECTIVE, PROGRESSIVE, IMPERFECT, PRETERITE, PARTICIPLE, GERUND ) sys.path.pop(0) VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" VOWELS = ("a", "e", "i", "o", "u") re_vowel = re.compile(r"a|e|i|o|u", re.I) is_vowel = lambda ch: ch in VOWELS #### PLURALIZE ##################################################################################### plural_irregular = { "bleu": "bleus", "pneu": "pneus", "travail": "travaux", "vitrail": "vitraux" } def pluralize(word, pos=NOUN, custom={}): """ Returns the plural of a given word. The custom dictionary is for user-defined replacements. """ if word in custom: return custom[word] w = word.lower() if w in plural_irregular: return plural_irregular[w] if w.endswith(("ais", "ois")): return w + "es" if w.endswith(("s", "x")): return w if w.endswith("al"): return w[:-2] + "aux" if w.endswith(("au", "eu")): return w + "x" return w + "s" #### SINGULARIZE ################################################################################### def singularize(word, pos=NOUN, custom={}): if word in custom: return custom[word] w = word.lower() # Common articles, determiners, pronouns: if pos in ("DT", "PRP", "PRP$", "WP", "RB", "IN"): if w == "du": return "de" if w == "ces": return "ce" if w == "les": return "le" if w == "des": return "un" if w == "mes": return "mon" if w == "ses": return "son" if w == "tes": return "ton" if w == "nos": return "notre" if w == "vos": return "votre" if w.endswith(("'", "’")): return w[:-1] + "e" if w.endswith("nnes"): # parisiennes => parisien return w[:-3] if w.endswith("ntes"): # passantes => passant return w[:-2] if w.endswith("euses"): # danseuses => danseur return w[:-3] + "r" if w.endswith("s"): return w[:-1] if w.endswith(("aux", "eux", "oux")): return w[:-1] if w.endswith("ii"): return w[:-1] + "o" if w.endswith(("ia", "ma")): return w[:-1] + "um" if "-" in w: return singularize(w.split("-")[0]) + "-" + "-".join(w.split("-")[1:]) return w #### VERB CONJUGATION ############################################################################## verb_inflections = [ ("issaient", "ir" ), ("eassions", "er" ), ("dissions", "dre" ), ("çassions", "cer" ), ( "eraient", "er" ), ( "assions", "er" ), ( "issions", "ir" ), ( "iraient", "ir" ), ( "isaient", "ire" ), ( "geaient", "ger" ), ( "eassent", "er" ), ( "geasses", "ger" ), ( "eassiez", "er" ), ( "dissiez", "dre" ), ( "dissent", "dre" ), ( "endrons", "endre"), ( "endriez", "endre"), ( "endrais", "endre"), ( "erions", "er" ), ( "assent", "er" ), ( "assiez", "er" ), ( "raient", "re" ), ( "issent", "ir" ), ( "issiez", "ir" ), ( "irions", "ir" ), ( "issons", "ir" ), ( "issant", "ir" ), ( "issait", "ir" ), ( "issais", "ir" ), ( "aient", "er" ), ( "èrent", "er" ), ( "erait", "er" ), ( "eront", "er" ), ( "erons", "er" ), ( "eriez", "er" ), ( "erais", "er" ), ( "asses", "er" ), ( "rions", "re" ), ( "isses", "ir" ), ( "irent", "ir" ), ( "irait", "ir" ), ( "irons", "ir" ), ( "iriez", "ir" ), ( "irais", "ir" ), ( "iront", "ir" ), ( "issez", "ir" ), ( "ions", "er" ), ( "erez", "er" ), ( "eras", "er" ), ( "erai", "er" ), ( "asse", "er" ), ( "âtes", "er" ), ( "âmes", "er" ), ( "isse", "ir" ), ( "îtes", "ir" ), ( "îmes", "ir" ), ( "irez", "ir" ), ( "iras", "ir" ), ( "irai", "ir" ), ( "ront", "re" ), ( "iez", "er" ), ( "ent", "er" ), ( "ais", "er" ), ( "ons", "er" ), ( "ait", "er" ), ( "ant", "er" ), ( "era", "er" ), ( "ira", "ir" ), ( "es", "er" ), ( "ez", "er" ), ( "as", "er" ), ( "ai", "er" ), ( "ât", "er" ), ( "ds", "dre" ), ( "is", "ir" ), ( "it", "ir" ), ( "ît", "ir" ), ( "ïr", "ïr" ), ( "nd", "ndre"), ( "nu", "nir" ), ( "e", "er" ), ( "é", "er" ), ( "a", "er" ), ( "t", "re" ), ( "s", "re" ), ( "i", "ir" ), ( "û", "ir" ), ( "u", "re" ), ( "d", "dre" ) ] class Verbs(_Verbs): def __init__(self): _Verbs.__init__(self, os.path.join(MODULE, "fr-verbs.txt"), language = "fr", default = {}, format = [ 0, 1, 2, 3, 4, 5, 6, 8, 24, # indicatif présent 34, 35, 36, 37, 38, 39, # indicatif passé simple 17, 18, 19, 20, 21, 22, # indicatif imparfait 40, 41, 42, 43, 44, 45, # indicatif futur simple 46, 47, 48, 49, 50, 51, # conditionnel présent 52, 53, 54, # impératif présent 55, 56, 57, 58, 59, 60, # subjonctif présent 67, 68, 69, 70, 71, 72 # subjonctif imparfait ]) def find_lemma(self, verb): """ Returns the base form of the given inflected verb, using a rule-based approach. """ # French has 20,000+ verbs, ending in -er (majority), -ir, -re. v = verb.lower() if v.endswith(("er", "ir", "re")): return v for a, b in verb_inflections: if v.endswith(a): return v[:-len(a)] + b return v def find_lexeme(self, verb): """ For a regular verb (base form), returns the forms using a rule-based approach. """ v = verb.lower() b = v[:-2] if v.endswith("ir") and not \ v.endswith(("couvrir", "cueillir", "découvrir", "offrir", "ouvrir", "souffrir")): # Regular inflection for verbs ending in -ir. # Some -ir verbs drop the last letter of the stem: dormir => je dors (not: je dormis). if v.endswith(("dormir", "mentir", "partir", "sentir", "servir", "sortir")): b0 = b[:-1] else: b0 = b + "i" return [v, b0 + "s", b0 + "s", b0 + "t", b + "issons", b + "issez", b + "issent", b + "issant", b + "i", b + "is", b + "is", b + "it", b + "îmes", b + "îtes", b + "irent", b + "issais", b + "issais", b + "issait", b + "issions", b + "issiez", b + "issaient", v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont", v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient", b + "is", b + "issons", b + "issez", b + "isse", b + "isses", b + "isse", b + "issions", b + "issiez", b + "issent", b + "isse", b + "isses", b + "ît", b + "issions", b + "issiez", b + "issent" ] elif v.endswith("re"): # Regular inflection for verbs ending in -re. # Verbs ending in -attre and -ettre drop the -t in the singular form. if v.endswith(("ttre")): b0 = b1 = b[:-1] else: b0 = b1 = b # Verbs ending in -aindre, -eindre and -oindre drop the -d. if v.endswith("indre"): b0, b1 = b[:-1], b[:-2] + "gn" # Verbs ending in -prendre drop the -d in the plural form. if v.endswith("prendre"): b0, b1 = b, b[:-1] return [v, b0 + "s", b0 + "s", b0 + "", b1 + "ons", b1 + "ez", b1 + "ent", b1 + "ant", b + "u", b + "is", b + "is", b + "it", b1 + "îmes", b1 + "îtes", b1 + "irent", b + "ais", b + "ais", b + "ait", b1 + "ions", b1 + "iez", b1 + "aient", b + "rai", b + "ras", b + "ra", b + "rons", b + "rez", b + "ront", b + "ais", b + "ais", b + "ait", b1 + "ions", b1 + "iez", b1 + "aient", b0 + "s", b1 + "ons", b1 + "ez", b + "e", b + "es", b + "e", b1 + "ions", b1 + "iez", b1 + "ent", b + "isse", b + "isses", b + "ît", b1 + "issions", b1 + "issiez", b1 + "issent" ] else: # Regular inflection for verbs ending in -er. # If the stem ends in -g, use -ge before hard vowels -a and -o: manger => mangeons. # If the stem ends in -c, use -ç before hard vowels -a and -o: lancer => lançons. e = v.endswith("ger") and "e" or "" c = v.endswith("cer") and b[:-1] + "ç" or b return [v, b + "e", b + "es", b + "e", c + e + "ons", b + "ez", b + "ent", c + e + "ant", b + "é", c + e + "ai", c + e + "as", c + e + "a", c + e + "âmes", c + e + "âtes", b + "èrent", c + e + "ais", c + e + "ais", c + e + "ait", b + "ions", b + "iez", c + e + "aient", v + "ai", v + "as", v + "a", v + "ons", v + "ez", v + "ont", v + "ais", v + "ais", v + "ait", v + "ions", v + "iez", v + "aient", b + "e", c + e + "ons", b + "ez", b + "e", b + "es", b + "e", b + "ions", b + "iez", b + "ent", c + e + "asse", c + e + "asses", c + e + "ât", c + e + "assions", c + e + "assiez", c + e + "assent" ] verbs = Verbs() conjugate, lemma, lexeme, tenses = \ verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses #### ATTRIBUTIVE & PREDICATIVE ##################################################################### def attributive(adjective): """ For a predicative adjective, returns the attributive form. """ # Must deal with feminine and plural. raise NotImplementedError def predicative(adjective): """ Returns the predicative adjective (lowercase): belles => beau. """ w = adjective.lower() if w.endswith(("ais", "ois")): return w if w.endswith(("és", "ée", "ées")): return w.rstrip("es") if w.endswith(("que", "ques")): return w.rstrip("s") if w.endswith(("nts", "nte", "ntes")): return w.rstrip("es") if w.endswith("eaux"): return w.rstrip("x") if w.endswith(("aux", "ale", "ales")): return w.rstrip("uxles") + "l" if w.endswith(("rteuse", "rteuses", "ailleuse")): return w.rstrip("es") + "r" if w.endswith(("euse", "euses")): return w.rstrip("es") + "x" if w.endswith(("els", "elle", "elles")): return w.rstrip("les") + "el" if w.endswith(("ifs", "ive", "ives")): return w.rstrip("es")[:-2] + "if" if w.endswith(("is", "ie", "ies")): return w.rstrip("es") if w.endswith(("enne", "ennes")): return w.rstrip("nes") + "en" if w.endswith(("onne", "onnes")): return w.rstrip("nes") + "n" if w.endswith(("igne", "ignes", "ingue", "ingues")): return w.rstrip("s") if w.endswith(("ène", "ènes")): return w.rstrip("s") if w.endswith(("ns", "ne", "nes")): return w.rstrip("es") if w.endswith(("ite", "ites")): return w.rstrip("es") if w.endswith(("is", "ise", "ises")): return w.rstrip("es") + "s" if w.endswith(("rice", "rices")): return w.rstrip("rices") + "eur" if w.endswith(("iers", "ière", "ières")): return w.rstrip("es")[:-3] + "ier" if w.endswith(("ette", "ettes")): return w.rstrip("tes") + "et" if w.endswith(("rds", "rde", "rdes")): return w.rstrip("es") if w.endswith(("nds", "nde", "ndes")): return w.rstrip("es") if w.endswith(("us", "ue", "ues")): return w.rstrip("es") return w.rstrip("s")