You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
830 lines
35 KiB
Python
830 lines
35 KiB
Python
#### PATTERN | EN | INFLECT ########################################################################
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright (c) 2010 University of Antwerp, Belgium
|
|
# Author: Tom De Smedt <tom@organisms.be>
|
|
# License: BSD (see LICENSE.txt for details).
|
|
|
|
####################################################################################################
|
|
# Regular expressions-based rules for English word inflection:
|
|
# - pluralization and singularization of nouns and adjectives,
|
|
# - conjugation of verbs,
|
|
# - comparative and superlative of adjectives.
|
|
|
|
# Accuracy (measured on CELEX English morphology word forms):
|
|
# 95% for pluralize()
|
|
# 96% for singularize()
|
|
# 95% for Verbs.find_lemma() (for regular verbs)
|
|
# 96% for Verbs.find_lexeme() (for regular verbs)
|
|
|
|
from __future__ import unicode_literals
|
|
from __future__ import division
|
|
|
|
from builtins import str, bytes, dict, int
|
|
from builtins import map, zip, filter
|
|
from builtins import object, range
|
|
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
# Directory that contains this module; data files shipped next to it
# are looked up relative to this path.
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except Exception:
    # Best-effort fallback (e.g., when __file__ is not defined):
    # an empty string makes the lookups relative to the working directory.
    # Only Exception is caught, so that KeyboardInterrupt and SystemExit
    # are no longer silently swallowed.
    MODULE = ""
|
|
|
|
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
|
|
|
|
from pattern.text import Verbs as _Verbs
|
|
from pattern.text import (
|
|
INFINITIVE, PRESENT, PAST, FUTURE,
|
|
FIRST, SECOND, THIRD,
|
|
SINGULAR, PLURAL, SG, PL,
|
|
PROGRESSIVE,
|
|
PARTICIPLE
|
|
)
|
|
|
|
sys.path.pop(0)
|
|
|
|
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
|
|
|
|
# Characters that are treated as vowels by the inflection rules ("y" included).
VOWELS = "aeiouy"

# Case-insensitive pattern that matches any single vowel.
re_vowel = re.compile(r"a|e|i|o|u|y", re.I)


def is_vowel(ch):
    """ Returns True if the given (lowercase) string occurs in VOWELS.
    """
    return ch in VOWELS
|
|
|
|
#### ARTICLE #######################################################################################
|
|
# Based on the Ruby Linguistics module by Michael Granger:
|
|
# http://www.deveiate.org/projects/Linguistics/wiki/English
|
|
|
|
# Ordered list of (compiled pattern, article)-tuples.
# The rules are tried from top to bottom and the first pattern found in the word wins,
# so the exceptions must stay in front of the general rules.
RE_ARTICLE = [(re.compile(_rule), _art) for _rule, _art in (
    # Exceptions: an hour, an honor.
    (r"euler|hour(?!i)|heir|honest|hono", "an"),
    # Abbreviations:
    # strings of capitals starting with a vowel-sound consonant followed by another consonant,
    # which are not likely to be real words.
    (r"(?!FJO|[HLMNS]Y.|RY[EO]|SQU|(F[LR]?|[HL]|MN?|N|RH?|S[CHKLMNPTVW]?|X(YL)?)[AEIOU])[FHLMNRSX][A-Z]", "an"),
    # Hyphenated: an f-16, an e-mail.
    (r"^[aefhilmnorsx][.-]", "an"),
    # Hyphenated: a b-52.
    (r"^[a-z][.-]", "a"),
    # Consonants: a bear.
    (r"^[^aeiouy]", "a"),
    # -eu like "you": a european.
    (r"^e[uw]", "a"),
    # -o like "wa": a one-liner.
    (r"^onc?e", "a"),
    # -u like "you": a university.
    (r"uni([^nmd]|mo)", "a"),
    # -u like "you": a uterus.
    (r"^u[bcfhjkqrst][aeiou]", "a"),
    # Vowels: an owl.
    (r"^[aeiou]", "an"),
    # y like "i": an yclept, a year.
    (r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
    # Guess "a".
    (r"", "a"),
)]
|
|
|
|
|
|
def definite_article(word):
    """ Returns the definite article for a given word.
        The word itself is not inspected: the result is always "the".
    """
    return "the"
|
|
|
|
|
|
def indefinite_article(word):
    """ Returns the indefinite article ("a" or "an") for a given word.
        For example: indefinite_article("university") => "a" university.
        If a phrase is given, only the part before the first space is examined.
    """
    head = word.split(" ")[0]
    # The rules are ordered: the article of the first pattern found in the word is used.
    candidates = (art for rule, art in RE_ARTICLE if rule.search(head) is not None)
    return next(candidates, None)
|
|
|
|
# Values for the "function" parameter of article().
DEFINITE = "definite"
INDEFINITE = "indefinite"
|
|
|
|
|
|
def article(word, function=INDEFINITE):
    """ Returns the indefinite (a or an) or definite (the) article for the given word.
        The definite article is returned when function is DEFINITE,
        the indefinite article for any other value.
    """
    if function == DEFINITE:
        return definite_article(word)
    return indefinite_article(word)
|
|
|
|
_article = article
|
|
|
|
|
|
def referenced(word, article=INDEFINITE):
    """ Returns a string with the article + the word, separated by a space.
        The article parameter is DEFINITE or INDEFINITE.
    """
    # The parameter shadows the article() function, hence the module-level _article alias.
    determiner = _article(word, article)
    return "%s %s" % (determiner, word)
|
|
|
|
#print referenced("hour")
|
|
#print referenced("FBI")
|
|
#print referenced("bear")
|
|
#print referenced("one-liner")
|
|
#print referenced("european")
|
|
#print referenced("university")
|
|
#print referenced("uterus")
|
|
#print referenced("owl")
|
|
#print referenced("yclept")
|
|
#print referenced("year")
|
|
|
|
#### PLURALIZE #####################################################################################
|
|
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
|
|
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
|
|
|
|
# Prepositions are used in forms like "mother-in-law" and "man at arms".
|
|
# When the second part of a compound is one of these words,
# the first part is the one that is inflected.
plural_prepositions = {
    "about", "above", "across", "after", "among", "around", "at", "athwart",
    "before", "behind", "below", "beneath", "beside", "besides", "between",
    "betwixt", "beyond", "but", "by",
    "during",
    "except",
    "for", "from",
    "in", "into",
    "near",
    "of", "off", "on", "onto", "out", "over",
    "since",
    "till", "to",
    "under", "until", "unto", "upon",
    "with",
}
|
|
|
|
# Inflection rules that are either:
|
|
# - general,
|
|
# - apply to a certain category of words,
|
|
# - apply to a certain category of words only in classical mode,
|
|
# - apply only in classical mode.
|
|
# Each rule is a (suffix, inflection, category, classic)-tuple.
|
|
plural_rules = [
|
|
# 0) Indefinite articles and demonstratives.
|
|
(( r"^a$|^an$", "some" , None, False),
|
|
( r"^this$", "these" , None, False),
|
|
( r"^that$", "those" , None, False),
|
|
( r"^any$", "all" , None, False)
|
|
), # 1) Possessive adjectives.
|
|
(( r"^my$", "our" , None, False),
|
|
( r"^your$", "your" , None, False),
|
|
( r"^thy$", "your" , None, False),
|
|
(r"^her$|^his$", "their" , None, False),
|
|
( r"^its$", "their" , None, False),
|
|
( r"^their$", "their" , None, False)
|
|
), # 2) Possessive pronouns.
|
|
(( r"^mine$", "ours" , None, False),
|
|
( r"^yours$", "yours" , None, False),
|
|
( r"^thine$", "yours" , None, False),
|
|
(r"^her$|^his$", "theirs" , None, False),
|
|
( r"^its$", "theirs" , None, False),
|
|
( r"^their$", "theirs" , None, False)
|
|
), # 3) Personal pronouns.
|
|
(( r"^I$", "we" , None, False),
|
|
( r"^me$", "us" , None, False),
|
|
( r"^myself$", "ourselves" , None, False),
|
|
( r"^you$", "you" , None, False),
|
|
(r"^thou$|^thee$", "ye" , None, False),
|
|
( r"^yourself$", "yourself" , None, False),
|
|
( r"^thyself$", "yourself" , None, False),
|
|
( r"^she$|^he$", "they" , None, False),
|
|
(r"^it$|^they$", "they" , None, False),
|
|
(r"^her$|^him$", "them" , None, False),
|
|
(r"^it$|^them$", "them" , None, False),
|
|
( r"^herself$", "themselves" , None, False),
|
|
( r"^himself$", "themselves" , None, False),
|
|
( r"^itself$", "themselves" , None, False),
|
|
( r"^themself$", "themselves" , None, False),
|
|
( r"^oneself$", "oneselves" , None, False)
|
|
), # 4) Words that do not inflect.
|
|
(( r"$", "" , "uninflected", False),
|
|
( r"$", "" , "uncountable", False),
|
|
( r"s$", "s" , "s-singular" , False),
|
|
( r"fish$", "fish" , None, False),
|
|
(r"([- ])bass$", "\\1bass" , None, False),
|
|
( r"ois$", "ois" , None, False),
|
|
( r"sheep$", "sheep" , None, False),
|
|
( r"deer$", "deer" , None, False),
|
|
( r"pox$", "pox" , None, False),
|
|
(r"([A-Z].*)ese$", "\\1ese" , None, False),
|
|
( r"itis$", "itis" , None, False),
|
|
(r"(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False)
|
|
), # 5) Irregular plural forms (e.g., mongoose, oxen).
|
|
(( r"atlas$", "atlantes" , None, True ),
|
|
( r"atlas$", "atlases" , None, False),
|
|
( r"beef$", "beeves" , None, True ),
|
|
( r"brother$", "brethren" , None, True ),
|
|
( r"child$", "children" , None, False),
|
|
( r"corpus$", "corpora" , None, True ),
|
|
( r"corpus$", "corpuses" , None, False),
|
|
( r"^cow$", "kine" , None, True ),
|
|
( r"ephemeris$", "ephemerides", None, False),
|
|
( r"ganglion$", "ganglia" , None, True ),
|
|
( r"genie$", "genii" , None, True ),
|
|
( r"genus$", "genera" , None, False),
|
|
( r"graffito$", "graffiti" , None, False),
|
|
( r"loaf$", "loaves" , None, False),
|
|
( r"money$", "monies" , None, True ),
|
|
( r"mongoose$", "mongooses" , None, False),
|
|
( r"mythos$", "mythoi" , None, False),
|
|
( r"octopus$", "octopodes" , None, True ),
|
|
( r"opus$", "opera" , None, True ),
|
|
( r"opus$", "opuses" , None, False),
|
|
( r"^ox$", "oxen" , None, False),
|
|
( r"penis$", "penes" , None, True ),
|
|
( r"penis$", "penises" , None, False),
|
|
( r"soliloquy$", "soliloquies", None, False),
|
|
( r"testis$", "testes" , None, False),
|
|
( r"trilby$", "trilbys" , None, False),
|
|
( r"turf$", "turves" , None, True ),
|
|
( r"numen$", "numena" , None, False),
|
|
( r"occiput$", "occipita" , None, True )
|
|
), # 6) Irregular inflections for common suffixes (e.g., synopses, mice, men).
|
|
(( r"man$", "men" , None, False),
|
|
( r"person$", "people" , None, False),
|
|
(r"([lm])ouse$", "\\1ice" , None, False),
|
|
( r"tooth$", "teeth" , None, False),
|
|
( r"goose$", "geese" , None, False),
|
|
( r"foot$", "feet" , None, False),
|
|
( r"zoon$", "zoa" , None, False),
|
|
( r"([csx])is$", "\\1es" , None, False)
|
|
), # 7) Fully assimilated classical inflections
|
|
# (e.g., vertebrae, codices).
|
|
(( r"ex$", "ices" , "ex-ices" , False),
|
|
( r"ex$", "ices" , "ex-ices*", True ), # * = classical mode
|
|
( r"um$", "a" , "um-a" , False),
|
|
( r"um$", "a" , "um-a*", True ),
|
|
( r"on$", "a" , "on-a" , False),
|
|
( r"a$", "ae" , "a-ae" , False),
|
|
( r"a$", "ae" , "a-ae*", True )
|
|
), # 8) Classical variants of modern inflections
|
|
# (e.g., stigmata, soprani).
|
|
(( r"trix$", "trices" , None, True),
|
|
( r"eau$", "eaux" , None, True),
|
|
( r"ieu$", "ieu" , None, True),
|
|
( r"([iay])nx$", "\\1nges" , None, True),
|
|
( r"en$", "ina" , "en-ina*", True),
|
|
( r"a$", "ata" , "a-ata*", True),
|
|
( r"is$", "ides" , "is-ides*", True),
|
|
( r"us$", "i" , "us-i*", True),
|
|
( r"us$", "us " , "us-us*", True),
|
|
( r"o$", "i" , "o-i*", True),
|
|
( r"$", "i" , "-i*", True),
|
|
( r"$", "im" , "-im*", True)
|
|
), # 9) -ch, -sh and -ss take -es in the plural
|
|
# (e.g., churches, classes).
|
|
(( r"([cs])h$", "\\1hes" , None, False),
|
|
( r"ss$", "sses" , None, False),
|
|
( r"x$", "xes" , None, False)
|
|
), # 10) -f or -fe sometimes take -ves in the plural
|
|
# (e.g, lives, wolves).
|
|
(( r"([aeo]l)f$", "\\1ves" , None, False),
|
|
( r"([^d]ea)f$", "\\1ves" , None, False),
|
|
( r"arf$", "arves" , None, False),
|
|
(r"([nlw]i)fe$", "\\1ves" , None, False),
|
|
), # 11) -y takes -ys if preceded by a vowel, -ies otherwise
|
|
# (e.g., storeys, Marys, stories).
|
|
((r"([aeiou])y$", "\\1ys" , None, False),
|
|
(r"([A-Z].*)y$", "\\1ys" , None, False),
|
|
( r"y$", "ies" , None, False)
|
|
), # 12) -o sometimes takes -os, -oes otherwise.
|
|
# -o is preceded by a vowel takes -os
|
|
# (e.g., lassos, potatoes, bamboos).
|
|
(( r"o$", "os", "o-os", False),
|
|
(r"([aeiou])o$", "\\1os" , None, False),
|
|
( r"o$", "oes" , None, False)
|
|
), # 13) Miltary stuff
|
|
# (e.g., Major Generals).
|
|
(( r"l$", "ls", "general-generals", False),
|
|
), # 14) Assume that the plural takes -s
|
|
# (cats, programmes, ...).
|
|
(( r"$", "s" , None, False),)
|
|
]
|
|
|
|
# For performance, compile the regular expressions once:
|
|
plural_rules = [[(re.compile(r[0]), r[1], r[2], r[3]) for r in grp] for grp in plural_rules]
|
|
|
|
# Suffix categories.
|
|
plural_categories = {
|
|
"uninflected": [
|
|
"bison" , "debris" , "headquarters" , "news" , "swine" ,
|
|
"bream" , "diabetes" , "herpes" , "pincers" , "trout" ,
|
|
"breeches" , "djinn" , "high-jinks" , "pliers" , "tuna" ,
|
|
"britches" , "eland" , "homework" , "proceedings", "whiting" ,
|
|
"carp" , "elk" , "innings" , "rabies" , "wildebeest" ,
|
|
"chassis" , "flounder" , "jackanapes" , "salmon" ,
|
|
"clippers" , "gallows" , "mackerel" , "scissors" ,
|
|
"cod" , "graffiti" , "measles" , "series" ,
|
|
"contretemps", "mews" , "shears" ,
|
|
"corps" , "mumps" , "species"
|
|
],
|
|
"uncountable": [
|
|
"advice" , "fruit" , "ketchup" , "meat" , "sand" ,
|
|
"bread" , "furniture" , "knowledge" , "mustard" , "software" ,
|
|
"butter" , "garbage" , "love" , "news" , "understanding",
|
|
"cheese" , "gravel" , "luggage" , "progress" , "water" ,
|
|
"electricity", "happiness" , "mathematics" , "research" ,
|
|
"equipment" , "information", "mayonnaise" , "rice"
|
|
],
|
|
"s-singular": [
|
|
"acropolis" , "caddis" , "dais" , "glottis" , "pathos" ,
|
|
"aegis" , "cannabis" , "digitalis" , "ibis" , "pelvis" ,
|
|
"alias" , "canvas" , "epidermis" , "lens" , "polis" ,
|
|
"asbestos" , "chaos" , "ethos" , "mantis" , "rhinoceros" ,
|
|
"bathos" , "cosmos" , "gas" , "marquis" , "sassafras" ,
|
|
"bias" , "glottis" , "metropolis" , "trellis"
|
|
],
|
|
"ex-ices": [
|
|
"codex" , "murex" , "silex"
|
|
],
|
|
"ex-ices*": [
|
|
"apex" , "index" , "pontifex" , "vertex" ,
|
|
"cortex" , "latex" , "simplex" , "vortex"
|
|
],
|
|
"um-a": [
|
|
"agendum" , "candelabrum", "desideratum" , "extremum" , "stratum" ,
|
|
"bacterium" , "datum" , "erratum" , "ovum"
|
|
],
|
|
"um-a*": [
|
|
"aquarium" , "emporium" , "maximum" , "optimum" , "stadium" ,
|
|
"compendium" , "enconium" , "medium" , "phylum" , "trapezium" ,
|
|
"consortium" , "gymnasium" , "memorandum" , "quantum" , "ultimatum" ,
|
|
"cranium" , "honorarium" , "millenium" , "rostrum" , "vacuum" ,
|
|
"curriculum" , "interregnum", "minimum" , "spectrum" , "velum" ,
|
|
"dictum" , "lustrum" , "momentum" , "speculum"
|
|
],
|
|
"on-a": [
|
|
"aphelion" , "hyperbaton" , "perihelion" ,
|
|
"asyndeton" , "noumenon" , "phenomenon" ,
|
|
"criterion" , "organon" , "prolegomenon"
|
|
],
|
|
"a-ae": [
|
|
"alga" , "alumna" , "vertebra"
|
|
],
|
|
"a-ae*": [
|
|
"abscissa" , "aurora" , "hyperbola" , "nebula" ,
|
|
"amoeba" , "formula" , "lacuna" , "nova" ,
|
|
"antenna" , "hydra" , "medusa" , "parabola"
|
|
],
|
|
"en-ina*": [
|
|
"foramen" , "lumen" , "stamen"
|
|
],
|
|
"a-ata*": [
|
|
"anathema" , "dogma" , "gumma" , "miasma" , "stigma" ,
|
|
"bema" , "drama" , "lemma" , "schema" , "stoma" ,
|
|
"carcinoma" , "edema" , "lymphoma" , "oedema" , "trauma" ,
|
|
"charisma" , "enema" , "magma" , "sarcoma" ,
|
|
"diploma" , "enigma" , "melisma" , "soma" ,
|
|
],
|
|
"is-ides*": [
|
|
"clitoris" , "iris"
|
|
],
|
|
"us-i*": [
|
|
"focus" , "nimbus" , "succubus" ,
|
|
"fungus" , "nucleolus" , "torus" ,
|
|
"genius" , "radius" , "umbilicus" ,
|
|
"incubus" , "stylus" , "uterus"
|
|
],
|
|
"us-us*": [
|
|
"apparatus" , "hiatus" , "plexus" , "status" ,
|
|
"cantus" , "impetus" , "prospectus" ,
|
|
"coitus" , "nexus" , "sinus" ,
|
|
],
|
|
"o-i*": [
|
|
"alto" , "canto" , "crescendo" , "soprano" ,
|
|
"basso" , "contralto" , "solo" , "tempo"
|
|
],
|
|
"-i*": [
|
|
"afreet" , "afrit" , "efreet"
|
|
],
|
|
"-im*": [
|
|
"cherub" , "goy" , "seraph"
|
|
],
|
|
"o-os": [
|
|
"albino" , "dynamo" , "guano" , "lumbago" , "photo" ,
|
|
"archipelago", "embryo" , "inferno" , "magneto" , "pro" ,
|
|
"armadillo" , "fiasco" , "jumbo" , "manifesto" , "quarto" ,
|
|
"commando" , "generalissimo", "medico" , "rhino" ,
|
|
"ditto" , "ghetto" , "lingo" , "octavo" , "stylo"
|
|
],
|
|
"general-generals": [
|
|
"Adjutant" , "Brigadier" , "Lieutenant" , "Major" , "Quartermaster",
|
|
"adjutant" , "brigadier" , "lieutenant" , "major" , "quartermaster"
|
|
]
|
|
}
|
|
|
|
|
|
def pluralize(word, pos=NOUN, custom={}, classical=True):
    """ Returns the plural of a given word, e.g., child => children.
        Handles nouns and adjectives, using classical inflection by default
        (i.e., where "matrix" pluralizes to "matrices" and not "matrixes").
        The custom dictionary is for user-defined replacements.
        If no rule applies, the word is returned unchanged.
    """
    # Note: the default custom dictionary is shared between calls, but it is only read.
    if word in custom:
        return custom[word]
    # Recurse genitives.
    # Remove the apostrophe and the trailing -s,
    # form the plural of the resultant noun, and then append an apostrophe (dog's => dogs').
    if word.endswith(("'", "'s")):
        # Slice off the exact suffix. str.rstrip("'s") removes a *set* of characters
        # and would also eat the final letters of the stem (boss's => bo).
        if word.endswith("'s"):
            stem = word[:-2]
        else:
            stem = word[:-1]
            if stem.endswith("s"):
                stem = stem[:-1]
        plural = pluralize(stem, pos, custom, classical)
        if plural.endswith("s"):
            return plural + "'"
        return plural + "'s"
    # Recurse compound words
    # (e.g., Postmasters General, mothers-in-law, Roman deities).
    parts = word.replace("-", " ").split(" ")
    if len(parts) > 1:
        head, tail = parts[0], parts[-1]
        # "X general" inflects X (postmasters general),
        # unless X is a military rank (major generals).
        # The rank check applies to both "general" and "General".
        inflect_head = parts[1] in plural_prepositions or (
            parts[1] in ("general", "General") and
            head not in plural_categories["general-generals"])
        # The head is a prefix and the tail is a suffix of the word, so they are
        # replaced by position: str.replace() would rewrite every occurrence.
        if inflect_head:
            return pluralize(head, pos, custom, classical) + word[len(head):]
        return word[:len(word) - len(tail)] + pluralize(tail, pos, custom, classical)
    # Only very few adjectives inflect:
    # for adjectives, only the first two rule groups are tried.
    groups = plural_rules[:2] if pos.startswith(ADJECTIVE) else plural_rules
    # Apply pluralization rules; the first rule that matches is used.
    for group in groups:
        for suffix, inflection, category, classic in group:
            # Classic rules only apply in classical mode.
            if classic and not classical:
                continue
            # A rule pertaining to a specific category only applies to the words in it.
            if category is not None and word not in plural_categories[category]:
                continue
            if suffix.search(word) is not None:
                return suffix.sub(inflection, word)
    return word
|
|
|
|
#print pluralize("part-of-speech")
|
|
#print pluralize("child")
|
|
#print pluralize("dog's")
|
|
#print pluralize("wolf")
|
|
#print pluralize("bear")
|
|
#print pluralize("kitchen knife")
|
|
#print pluralize("octopus", classical=True)
|
|
#print pluralize("matrix", classical=True)
|
|
#print pluralize("matrix", classical=False)
|
|
#print pluralize("my", pos=ADJECTIVE)
|
|
|
|
#### SINGULARIZE ###################################################################################
|
|
# Adapted from Bermi Ferrer's Inflector for Python:
|
|
# http://www.bermi.org/inflector/
|
|
|
|
# Copyright (c) 2006 Bermi Ferrer Martinez
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
# of this software to deal in this software without restriction, including
|
|
# without limitation the rights to use, copy, modify, merge, publish,
|
|
# distribute, sublicense, and/or sell copies of this software, and to permit
|
|
# persons to whom this software is furnished to do so, subject to the following
|
|
# condition:
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
# THIS SOFTWARE.
|
|
|
|
singular_rules = [
|
|
(r'(?i)(.)ae$' , '\\1a' ),
|
|
(r'(?i)(.)itis$' , '\\1itis' ),
|
|
(r'(?i)(.)eaux$' , '\\1eau' ),
|
|
(r'(?i)(quiz)zes$' , '\\1' ),
|
|
(r'(?i)(matr)ices$' , '\\1ix' ),
|
|
(r'(?i)(ap|vert|ind)ices$', '\\1ex' ),
|
|
(r'(?i)^(ox)en' , '\\1' ),
|
|
(r'(?i)(alias|status)es$' , '\\1' ),
|
|
(r'(?i)([octop|vir])i$' , '\\1us' ),
|
|
(r'(?i)(cris|ax|test)es$' , '\\1is' ),
|
|
(r'(?i)(shoe)s$' , '\\1' ),
|
|
(r'(?i)(o)es$' , '\\1' ),
|
|
(r'(?i)(bus)es$' , '\\1' ),
|
|
(r'(?i)([m|l])ice$' , '\\1ouse' ),
|
|
(r'(?i)(x|ch|ss|sh)es$' , '\\1' ),
|
|
(r'(?i)(m)ovies$' , '\\1ovie' ),
|
|
(r'(?i)(.)ombies$' , '\\1ombie'),
|
|
(r'(?i)(s)eries$' , '\\1eries'),
|
|
(r'(?i)([^aeiouy]|qu)ies$', '\\1y' ),
|
|
# -f, -fe sometimes take -ves in the plural
|
|
# (e.g., lives, wolves).
|
|
(r"([aeo]l)ves$" , "\\1f" ),
|
|
(r"([^d]ea)ves$" , "\\1f" ),
|
|
(r"arves$" , "arf" ),
|
|
(r"erves$" , "erve" ),
|
|
(r"([nlw]i)ves$" , "\\1fe" ),
|
|
(r'(?i)([lr])ves$' , '\\1f' ),
|
|
(r"([aeo])ves$" , "\\1ve" ),
|
|
(r'(?i)(sive)s$' , '\\1' ),
|
|
(r'(?i)(tive)s$' , '\\1' ),
|
|
(r'(?i)(hive)s$' , '\\1' ),
|
|
(r'(?i)([^f])ves$' , '\\1fe' ),
|
|
# -ses suffixes.
|
|
(r'(?i)(^analy)ses$' , '\\1sis' ),
|
|
(r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'),
|
|
(r'(?i)(.)opses$' , '\\1opsis'),
|
|
(r'(?i)(.)yses$' , '\\1ysis' ),
|
|
(r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
|
|
(r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'),
|
|
(r'(?i)(.)oses$' , '\\1osis' ),
|
|
# -a
|
|
(r'(?i)([ti])a$' , '\\1um' ),
|
|
(r'(?i)(n)ews$' , '\\1ews' ),
|
|
(r'(?i)s$' , '' ),
|
|
]
|
|
|
|
# For performance, compile the regular expressions only once:
|
|
singular_rules = [(re.compile(r[0]), r[1]) for r in singular_rules]
|
|
|
|
singular_uninflected = set((
|
|
"bison" , "debris" , "headquarters", "pincers" , "trout" ,
|
|
"bream" , "diabetes" , "herpes" , "pliers" , "tuna" ,
|
|
"breeches" , "djinn" , "high-jinks" , "proceedings", "whiting" ,
|
|
"britches" , "eland" , "homework" , "rabies" , "wildebeest",
|
|
"carp" , "elk" , "innings" , "salmon" ,
|
|
"chassis" , "flounder" , "jackanapes" , "scissors" ,
|
|
"christmas" , "gallows" , "mackerel" , "series" ,
|
|
"clippers" , "georgia" , "measles" , "shears" ,
|
|
"cod" , "graffiti" , "mews" , "species" ,
|
|
"contretemps", "mumps" , "swine" ,
|
|
"corps" , "news" , "swiss" ,
|
|
))
|
|
singular_uncountable = set((
|
|
"advice" , "equipment", "happiness" , "luggage" , "news" , "software" ,
|
|
"bread" , "fruit" , "information" , "mathematics", "progress" , "understanding",
|
|
"butter" , "furniture", "ketchup" , "mayonnaise" , "research" , "water" ,
|
|
"cheese" , "garbage" , "knowledge" , "meat" , "rice" ,
|
|
"electricity", "gravel" , "love" , "mustard" , "sand" ,
|
|
))
|
|
singular_ie = set((
|
|
"alergie" , "cutie" , "hoagie" , "newbie" , "softie" , "veggie" ,
|
|
"auntie" , "doggie" , "hottie" , "nightie" , "sortie" , "weenie" ,
|
|
"beanie" , "eyrie" , "indie" , "oldie" , "stoolie" , "yuppie" ,
|
|
"birdie" , "freebie" , "junkie" , "^pie" , "sweetie" , "zombie" ,
|
|
"bogie" , "goonie" , "laddie" , "pixie" , "techie" ,
|
|
"bombie" , "groupie" , "laramie" , "quickie" , "^tie" ,
|
|
"collie" , "hankie" , "lingerie" , "reverie" , "toughie" ,
|
|
"cookie" , "hippie" , "meanie" , "rookie" , "valkyrie" ,
|
|
))
|
|
singular_irregular = {
|
|
"atlantes": "atlas",
|
|
"atlases": "atlas",
|
|
"axes": "axe",
|
|
"beeves": "beef",
|
|
"brethren": "brother",
|
|
"children": "child",
|
|
"corpora": "corpus",
|
|
"corpuses": "corpus",
|
|
"ephemerides": "ephemeris",
|
|
"feet": "foot",
|
|
"ganglia": "ganglion",
|
|
"geese": "goose",
|
|
"genera": "genus",
|
|
"genii": "genie",
|
|
"graffiti": "graffito",
|
|
"helves": "helve",
|
|
"kine": "cow",
|
|
"leaves": "leaf",
|
|
"loaves": "loaf",
|
|
"men": "man",
|
|
"mongooses": "mongoose",
|
|
"monies": "money",
|
|
"moves": "move",
|
|
"mythoi": "mythos",
|
|
"numena": "numen",
|
|
"occipita": "occiput",
|
|
"octopodes": "octopus",
|
|
"opera": "opus",
|
|
"opuses": "opus",
|
|
"our": "my",
|
|
"oxen": "ox",
|
|
"penes": "penis",
|
|
"penises": "penis",
|
|
"people": "person",
|
|
"sexes": "sex",
|
|
"soliloquies": "soliloquy",
|
|
"teeth": "tooth",
|
|
"testes": "testis",
|
|
"trilbys": "trilby",
|
|
"turves": "turf",
|
|
"zoa": "zoon",
|
|
}
|
|
|
|
|
|
def singularize(word, pos=NOUN, custom={}):
    """ Returns the singular of a given word.
        The custom dictionary is for user-defined replacements.
        If no rule applies, the word is returned unchanged.
    """
    # Note: the default custom dictionary is shared between calls, but it is only read.
    if word in custom:
        return custom[word]
    # Recurse compound words (e.g. mothers-in-law).
    if "-" in word:
        parts = word.split("-")
        if len(parts) > 1 and parts[1] in plural_prepositions:
            return singularize(parts[0], pos, custom) + "-" + "-".join(parts[1:])
    # dogs' => dog's
    if word.endswith("'"):
        # Pass pos and custom along, so that user-defined replacements also apply here.
        return singularize(word[:-1], pos, custom) + "'s"
    w = word.lower()
    # Words that do not inflect are returned as given.
    # The test is exact membership: the former x.endswith(w) test was inverted
    # and matched every suffix of a listed word (e.g., "ears" in "shears").
    if w in singular_uninflected or w in singular_uncountable:
        return word
    # Words with a singular on -ie only lose the -s (cookies => cookie, not cooky).
    for x in singular_ie:
        if x.startswith("^"):
            # Entries marked with "^" must match the whole word (pies, ties).
            found = w == x[1:] + "s"
        else:
            found = w.endswith(x + "s")
        if found:
            return word[:-1]
    # Irregular forms, also as the last part of a longer word (women => woman).
    for x in singular_irregular:
        if w.endswith(x):
            return re.sub('(?i)' + re.escape(x) + '$', singular_irregular[x], word)
    # Suffix rules: the first rule that matches is used.
    for suffix, inflection in singular_rules:
        m = suffix.search(word)
        if m is None:
            continue
        # Remove references to groups that did not take part in the match,
        # so that the replacement string never refers to an unmatched group.
        for k, g in enumerate(m.groups()):
            if g is None:
                inflection = inflection.replace('\\' + str(k + 1), '')
        return suffix.sub(inflection, word)
    return word
|
|
|
|
#### VERB CONJUGATION ##############################################################################
|
|
|
|
|
|
class Verbs(_Verbs):
    """ English verb conjugation: a verb lexicon loaded from en-verbs.txt,
        with rule-based fallbacks (find_lemma(), find_lexeme()) for regular verbs.
    """

    def __init__(self):
        """ Initializes the base class with the path of en-verbs.txt (next to this module),
            the order of the tenses in each line of that file (format),
            and for each tense the tense to fall back on (default).
        """
        _Verbs.__init__(self, os.path.join(MODULE, "en-verbs.txt"),
            language = "en",
              format = [0, 1, 2, 3, 7, 8, 17, 18, 19, 23, 25, 24, 16, 9, 10, 11, 15, 33, 26, 27, 28, 32],
             default = {
                 1: 0,   2: 0,   3: 0,   7: 0,  # present singular => infinitive ("I walk")
                 4: 7,   5: 7,   6: 7,          # present plural
                17: 25, 18: 25, 19: 25, 23: 25, # past singular
                20: 23, 21: 23, 22: 23,         # past plural
                 9: 16, 10: 16, 11: 16, 15: 16, # present singular negated
                12: 15, 13: 15, 14: 15,         # present plural negated
                26: 33, 27: 33, 28: 33,         # past singular negated
                29: 32, 30: 32, 31: 32, 32: 33  # past plural negated
            })

    def find_lemma(self, verb):
        """ Returns the base form of the given inflected verb, using a rule-based approach.
            This is problematic if a verb ending in -e is given in the past tense or gerund.
            The result is always lowercase.
        """
        v = verb.lower()
        b = False # True if an -ing or -ed suffix was removed.
        # Contractions.
        if v in ("'m", "'re", "'s", "n't"):
            return "be"
        if v in ("'d", "'ll"):
            return "will"
        # Compare with the string itself: ("'ve") is not a tuple but a string,
        # so "in" was a substring test that also matched "", "v" and "e".
        if v == "'ve":
            return "have"
        if v.endswith("s"):
            if v.endswith("ies") and len(v) > 3 and v[-4] not in VOWELS:
                return v[:-3] + "y" # complies => comply
            if v.endswith(("sses", "shes", "ches", "xes")):
                return v[:-2] # kisses => kiss
            return v[:-1]
        if v.endswith("ied") and re_vowel.search(v[:-3]) is not None:
            return v[:-3] + "y" # envied => envy
        # Remove one suffix only: after removing -ing, a stem that itself ends in -ed
        # must be left intact (needing => need, not ne).
        if v.endswith("ing") and re_vowel.search(v[:-3]) is not None:
            v = v[:-3]; b = True # chopping => chopp
        elif v.endswith("ed") and re_vowel.search(v[:-2]) is not None:
            v = v[:-2]; b = True # danced => danc
        if b:
            # Doubled consonant after short vowel: chopp => chop.
            if len(v) > 3 and v[-1] == v[-2] and v[-3] in VOWELS and v[-4] not in VOWELS and not v.endswith("ss"):
                return v[:-1]
            if v.endswith(("ick", "ack")):
                return v[:-1] # panick => panic
            # Guess common cases where the base form ends in -e:
            if v.endswith(("v", "z", "c", "i")):
                return v + "e" # danc => dance
            if v.endswith("g") and v.endswith(("dg", "lg", "ng", "rg")):
                return v + "e" # indulg => indulge
            if v.endswith(("b", "d", "g", "k", "l", "m", "r", "s", "t")) \
            and len(v) > 2 and v[-2] in VOWELS and not v[-3] in VOWELS \
            and not v.endswith("er"):
                return v + "e" # generat => generate
            if v.endswith("n") and v.endswith(("an", "in")) and not v.endswith(("ain", "oin", "oan")):
                return v + "e" # imagin => imagine
            if v.endswith("l") and len(v) > 1 and v[-2] not in VOWELS:
                return v + "e" # squabbl => squabble
            if v.endswith("f") and len(v) > 2 and v[-2] in VOWELS and v[-3] not in VOWELS:
                return v + "e" # chaf => chafe
            if v.endswith("e"):
                return v + "e" # decre => decree
            if v.endswith(("th", "ang", "un", "cr", "vr", "rs", "ps", "tr")):
                return v + "e"
        return v

    def find_lexeme(self, verb):
        """ For a regular verb (base form), returns the forms using a rule-based approach.
            The result is a list of 12 lowercase strings: the base form,
            with the -s form at index 3, the -ing form at index 5
            and the -ed form at index 6-11.
        """
        v = verb.lower()
        if len(v) > 1 and v.endswith("e") and v[-2] not in VOWELS:
            # Verbs ending in a consonant followed by "e": dance, save, devote, evolve.
            return [v, v, v, v + "s", v, v[:-1] + "ing"] + [v + "d"] * 6
        if len(v) > 1 and v.endswith("y") and v[-2] not in VOWELS:
            # Verbs ending in a consonant followed by "y": comply, copy, magnify.
            return [v, v, v, v[:-1] + "ies", v, v + "ing"] + [v[:-1] + "ied"] * 6
        if v.endswith(("ss", "sh", "ch", "x")):
            # Verbs ending in sibilants: kiss, bless, box, polish, preach.
            return [v, v, v, v + "es", v, v + "ing"] + [v + "ed"] * 6
        if v.endswith("ic"):
            # Verbs ending in -ic: panic, mimic.
            return [v, v, v, v + "es", v, v + "king"] + [v + "ked"] * 6
        if len(v) > 1 and v[-1] not in VOWELS and v[-2] not in VOWELS:
            # Verbs ending in a consonant cluster: delight, clamp.
            return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
        if (len(v) > 1 and v.endswith(("y", "w")) and v[-2] in VOWELS) \
        or (len(v) > 2 and v[-1] not in VOWELS and v[-2] in VOWELS and v[-3] in VOWELS) \
        or (len(v) > 3 and v[-1] not in VOWELS and v[-3] in VOWELS and v[-4] in VOWELS):
            # Verbs ending in a long vowel or diphthong followed by a consonant: paint, devour, play.
            return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
        if len(v) > 2 and v[-1] not in VOWELS and v[-2] in VOWELS and v[-3] not in VOWELS:
            # Verbs ending in a short vowel followed by a consonant: chat, chop, or compel.
            return [v, v, v, v + "s", v, v + v[-1] + "ing"] + [v + v[-1] + "ed"] * 6
        return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
|
|
|
|
verbs = Verbs()
|
|
|
|
conjugate, lemma, lexeme, tenses = \
|
|
verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses
|
|
|
|
#print conjugate("imaginarify", "part", parse=True)
|
|
#print conjugate("imaginarify", "part", parse=False)
|
|
|
|
#### COMPARATIVE & SUPERLATIVE #####################################################################
|
|
|
|
VOWELS = "aeiouy"
|
|
|
|
grade_irregular = {
|
|
"bad": ( "worse", "worst"),
|
|
"far": ("further", "farthest"),
|
|
"good": ( "better", "best"),
|
|
"hind": ( "hinder", "hindmost"),
|
|
"ill": ( "worse", "worst"),
|
|
"less": ( "lesser", "least"),
|
|
"little": ( "less", "least"),
|
|
"many": ( "more", "most"),
|
|
"much": ( "more", "most"),
|
|
"well": ( "better", "best")
|
|
}
|
|
|
|
grade_uninflected = ["giant", "glib", "hurt", "known", "madly"]
|
|
|
|
COMPARATIVE = "er"
|
|
SUPERLATIVE = "est"
|
|
|
|
|
|
def _count_syllables(word):
    """ Returns the estimated number of syllables in the word by counting vowel-groups.
        A final -e is not counted, unless the word consists of that single letter.
    """
    stem = word[:-1] if word.endswith("e") and len(word) > 1 else word
    vowel = [ch in VOWELS for ch in stem]
    # A vowel-group starts at each vowel that is not preceded by another vowel.
    return sum(1 for previous, current in zip([False] + vowel, vowel) if current and not previous)
|
|
|
|
|
|
def grade(adjective, suffix=COMPARATIVE):
    """ Returns the comparative or superlative form of the given adjective.
        The suffix is COMPARATIVE ("er") or SUPERLATIVE ("est").
        Adjectives that do not take a suffix are preceded by "more" or "most" instead.
    """
    syllables = _count_syllables(adjective)
    if adjective in grade_irregular:
        # A number of adjectives inflect irregularly.
        forms = grade_irregular[adjective]
        return forms[0] if suffix == COMPARATIVE else forms[1]
    # A number of adjectives don't inflect at all (they are handled at the bottom).
    if adjective not in grade_uninflected:
        if syllables <= 2 and adjective.endswith("e"):
            # With a final -e, the suffix loses its own e: larger, wiser.
            return adjective + suffix.lstrip("e")
        if syllables == 1:
            cvc = len(adjective) >= 3 \
              and adjective[-1] not in VOWELS \
              and adjective[-2] in VOWELS \
              and adjective[-3] not in VOWELS
            if cvc and not adjective.endswith("w"):
                # With one syllable ending with consonant-vowel-consonant: bigger, thinner.
                # Exceptions are words on -w: lower, newer.
                return adjective + adjective[-1] + suffix
            # With one syllable ending with more consonants or vowels: briefer.
            return adjective + suffix
        if syllables == 2 and adjective.endswith("y"):
            # With two syllables ending with a y: funnier, hairier.
            return adjective[:-1] + "i" + suffix
        if syllables == 2 and adjective[-2:] in ("er", "le", "ow"):
            # With two syllables and specific suffixes: gentler, narrower.
            return adjective + suffix
    # With three or more syllables: more generous, more important.
    return "%s %s" % ("more" if suffix == COMPARATIVE else "most", adjective)
|
|
|
|
|
|
def comparative(adjective):
    """ Returns the comparative form of the given adjective, i.e., grade() with COMPARATIVE.
    """
    return grade(adjective, suffix=COMPARATIVE)
|
|
|
|
|
|
def superlative(adjective):
    """ Returns the superlative form of the given adjective, i.e., grade() with SUPERLATIVE.
    """
    return grade(adjective, suffix=SUPERLATIVE)
|
|
|
|
#### ATTRIBUTIVE & PREDICATIVE #####################################################################
|
|
|
|
|
|
def attributive(adjective):
    """ Returns the attributive form of the given adjective.
        The adjective is returned unchanged.
    """
    return adjective
|
|
|
|
|
|
def predicative(adjective):
    """ Returns the predicative form of the given adjective.
        The adjective is returned unchanged.
    """
    return adjective
|