You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

830 lines
35 KiB
Python

5 years ago
#### PATTERN | EN | INFLECT ########################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
####################################################################################################
# Regular expressions-based rules for English word inflection:
# - pluralization and singularization of nouns and adjectives,
# - conjugation of verbs,
# - comparative and superlative of adjectives.
# Accuracy (measured on CELEX English morphology word forms):
# 95% for pluralize()
# 96% for singularize()
# 95% for Verbs.find_lemma() (for regular verbs)
# 96% for Verbs.find_lexeme() (for regular verbs)
from __future__ import unicode_literals
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
import sys
import re
try:
MODULE = os.path.dirname(os.path.realpath(__file__))
except:
MODULE = ""
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
from pattern.text import Verbs as _Verbs
from pattern.text import (
INFINITIVE, PRESENT, PAST, FUTURE,
FIRST, SECOND, THIRD,
SINGULAR, PLURAL, SG, PL,
PROGRESSIVE,
PARTICIPLE
)
sys.path.pop(0)
VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
VOWELS = "aeiouy"
re_vowel = re.compile(r"a|e|i|o|u|y", re.I)
is_vowel = lambda ch: ch in VOWELS
#### ARTICLE #######################################################################################
# Based on the Ruby Linguistics module by Michael Granger:
# http://www.deveiate.org/projects/Linguistics/wiki/English
RE_ARTICLE = list(map(lambda x: (re.compile(x[0]), x[1]), (
(r"euler|hour(?!i)|heir|honest|hono", "an"), # exceptions: an hour, an honor
# Abbreviations:
# strings of capitals starting with a vowel-sound consonant followed by another consonant,
# which are not likely to be real words.
(r"(?!FJO|[HLMNS]Y.|RY[EO]|SQU|(F[LR]?|[HL]|MN?|N|RH?|S[CHKLMNPTVW]?|X(YL)?)[AEIOU])[FHLMNRSX][A-Z]", "an"),
(r"^[aefhilmnorsx][.-]" , "an"), # hyphenated: an f-16, an e-mail
(r"^[a-z][.-]" , "a" ), # hyphenated: a b-52
(r"^[^aeiouy]" , "a" ), # consonants: a bear
(r"^e[uw]" , "a" ), # -eu like "you": a european
(r"^onc?e" , "a" ), # -o like "wa" : a one-liner
(r"uni([^nmd]|mo)" , "a" ), # -u like "you": a university
(r"^u[bcfhjkqrst][aeiou]", "a" ), # -u like "you": a uterus
(r"^[aeiou]" , "an"), # vowels: an owl
(r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"), # y like "i": an yclept, a year
(r"" , "a" ) # guess "a"
)))
def definite_article(word):
return "the"
def indefinite_article(word):
""" Returns the indefinite article for a given word.
For example: indefinite_article("university") => "a" university.
"""
word = word.split(" ")[0]
for rule, article in RE_ARTICLE:
if rule.search(word) is not None:
return article
DEFINITE, INDEFINITE = \
"definite", "indefinite"
def article(word, function=INDEFINITE):
""" Returns the indefinite (a or an) or definite (the) article for the given word.
"""
return function == DEFINITE and definite_article(word) or indefinite_article(word)
_article = article
def referenced(word, article=INDEFINITE):
""" Returns a string with the article + the word.
"""
return "%s %s" % (_article(word, article), word)
#print referenced("hour")
#print referenced("FBI")
#print referenced("bear")
#print referenced("one-liner")
#print referenced("european")
#print referenced("university")
#print referenced("uterus")
#print referenced("owl")
#print referenced("yclept")
#print referenced("year")
#### PLURALIZE #####################################################################################
# Based on "An Algorithmic Approach to English Pluralization" by Damian Conway:
# http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
# Prepositions are used in forms like "mother-in-law" and "man at arms".
plural_prepositions = set((
"about" , "before" , "during", "of" , "till" ,
"above" , "behind" , "except", "off" , "to" ,
"across" , "below" , "for" , "on" , "under",
"after" , "beneath", "from" , "onto" , "until",
"among" , "beside" , "in" , "out" , "unto" ,
"around" , "besides", "into" , "over" , "upon" ,
"at" , "between", "near" , "since", "with" ,
"athwart", "betwixt",
"beyond",
"but",
"by"))
# Inflection rules that are either:
# - general,
# - apply to a certain category of words,
# - apply to a certain category of words only in classical mode,
# - apply only in classical mode.
# Each rule is a (suffix, inflection, category, classic)-tuple.
plural_rules = [
# 0) Indefinite articles and demonstratives.
(( r"^a$|^an$", "some" , None, False),
( r"^this$", "these" , None, False),
( r"^that$", "those" , None, False),
( r"^any$", "all" , None, False)
), # 1) Possessive adjectives.
(( r"^my$", "our" , None, False),
( r"^your$", "your" , None, False),
( r"^thy$", "your" , None, False),
(r"^her$|^his$", "their" , None, False),
( r"^its$", "their" , None, False),
( r"^their$", "their" , None, False)
), # 2) Possessive pronouns.
(( r"^mine$", "ours" , None, False),
( r"^yours$", "yours" , None, False),
( r"^thine$", "yours" , None, False),
(r"^her$|^his$", "theirs" , None, False),
( r"^its$", "theirs" , None, False),
( r"^their$", "theirs" , None, False)
), # 3) Personal pronouns.
(( r"^I$", "we" , None, False),
( r"^me$", "us" , None, False),
( r"^myself$", "ourselves" , None, False),
( r"^you$", "you" , None, False),
(r"^thou$|^thee$", "ye" , None, False),
( r"^yourself$", "yourself" , None, False),
( r"^thyself$", "yourself" , None, False),
( r"^she$|^he$", "they" , None, False),
(r"^it$|^they$", "they" , None, False),
(r"^her$|^him$", "them" , None, False),
(r"^it$|^them$", "them" , None, False),
( r"^herself$", "themselves" , None, False),
( r"^himself$", "themselves" , None, False),
( r"^itself$", "themselves" , None, False),
( r"^themself$", "themselves" , None, False),
( r"^oneself$", "oneselves" , None, False)
), # 4) Words that do not inflect.
(( r"$", "" , "uninflected", False),
( r"$", "" , "uncountable", False),
( r"s$", "s" , "s-singular" , False),
( r"fish$", "fish" , None, False),
(r"([- ])bass$", "\\1bass" , None, False),
( r"ois$", "ois" , None, False),
( r"sheep$", "sheep" , None, False),
( r"deer$", "deer" , None, False),
( r"pox$", "pox" , None, False),
(r"([A-Z].*)ese$", "\\1ese" , None, False),
( r"itis$", "itis" , None, False),
(r"(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose", None, False)
), # 5) Irregular plural forms (e.g., mongoose, oxen).
(( r"atlas$", "atlantes" , None, True ),
( r"atlas$", "atlases" , None, False),
( r"beef$", "beeves" , None, True ),
( r"brother$", "brethren" , None, True ),
( r"child$", "children" , None, False),
( r"corpus$", "corpora" , None, True ),
( r"corpus$", "corpuses" , None, False),
( r"^cow$", "kine" , None, True ),
( r"ephemeris$", "ephemerides", None, False),
( r"ganglion$", "ganglia" , None, True ),
( r"genie$", "genii" , None, True ),
( r"genus$", "genera" , None, False),
( r"graffito$", "graffiti" , None, False),
( r"loaf$", "loaves" , None, False),
( r"money$", "monies" , None, True ),
( r"mongoose$", "mongooses" , None, False),
( r"mythos$", "mythoi" , None, False),
( r"octopus$", "octopodes" , None, True ),
( r"opus$", "opera" , None, True ),
( r"opus$", "opuses" , None, False),
( r"^ox$", "oxen" , None, False),
( r"penis$", "penes" , None, True ),
( r"penis$", "penises" , None, False),
( r"soliloquy$", "soliloquies", None, False),
( r"testis$", "testes" , None, False),
( r"trilby$", "trilbys" , None, False),
( r"turf$", "turves" , None, True ),
( r"numen$", "numena" , None, False),
( r"occiput$", "occipita" , None, True )
), # 6) Irregular inflections for common suffixes (e.g., synopses, mice, men).
(( r"man$", "men" , None, False),
( r"person$", "people" , None, False),
(r"([lm])ouse$", "\\1ice" , None, False),
( r"tooth$", "teeth" , None, False),
( r"goose$", "geese" , None, False),
( r"foot$", "feet" , None, False),
( r"zoon$", "zoa" , None, False),
( r"([csx])is$", "\\1es" , None, False)
), # 7) Fully assimilated classical inflections
# (e.g., vertebrae, codices).
(( r"ex$", "ices" , "ex-ices" , False),
( r"ex$", "ices" , "ex-ices*", True ), # * = classical mode
( r"um$", "a" , "um-a" , False),
( r"um$", "a" , "um-a*", True ),
( r"on$", "a" , "on-a" , False),
( r"a$", "ae" , "a-ae" , False),
( r"a$", "ae" , "a-ae*", True )
), # 8) Classical variants of modern inflections
# (e.g., stigmata, soprani).
(( r"trix$", "trices" , None, True),
( r"eau$", "eaux" , None, True),
( r"ieu$", "ieu" , None, True),
( r"([iay])nx$", "\\1nges" , None, True),
( r"en$", "ina" , "en-ina*", True),
( r"a$", "ata" , "a-ata*", True),
( r"is$", "ides" , "is-ides*", True),
( r"us$", "i" , "us-i*", True),
( r"us$", "us " , "us-us*", True),
( r"o$", "i" , "o-i*", True),
( r"$", "i" , "-i*", True),
( r"$", "im" , "-im*", True)
), # 9) -ch, -sh and -ss take -es in the plural
# (e.g., churches, classes).
(( r"([cs])h$", "\\1hes" , None, False),
( r"ss$", "sses" , None, False),
( r"x$", "xes" , None, False)
), # 10) -f or -fe sometimes take -ves in the plural
# (e.g, lives, wolves).
(( r"([aeo]l)f$", "\\1ves" , None, False),
( r"([^d]ea)f$", "\\1ves" , None, False),
( r"arf$", "arves" , None, False),
(r"([nlw]i)fe$", "\\1ves" , None, False),
), # 11) -y takes -ys if preceded by a vowel, -ies otherwise
# (e.g., storeys, Marys, stories).
((r"([aeiou])y$", "\\1ys" , None, False),
(r"([A-Z].*)y$", "\\1ys" , None, False),
( r"y$", "ies" , None, False)
), # 12) -o sometimes takes -os, -oes otherwise.
# -o is preceded by a vowel takes -os
# (e.g., lassos, potatoes, bamboos).
(( r"o$", "os", "o-os", False),
(r"([aeiou])o$", "\\1os" , None, False),
( r"o$", "oes" , None, False)
), # 13) Miltary stuff
# (e.g., Major Generals).
(( r"l$", "ls", "general-generals", False),
), # 14) Assume that the plural takes -s
# (cats, programmes, ...).
(( r"$", "s" , None, False),)
]
# For performance, compile the regular expressions once:
plural_rules = [[(re.compile(r[0]), r[1], r[2], r[3]) for r in grp] for grp in plural_rules]
# Suffix categories.
plural_categories = {
"uninflected": [
"bison" , "debris" , "headquarters" , "news" , "swine" ,
"bream" , "diabetes" , "herpes" , "pincers" , "trout" ,
"breeches" , "djinn" , "high-jinks" , "pliers" , "tuna" ,
"britches" , "eland" , "homework" , "proceedings", "whiting" ,
"carp" , "elk" , "innings" , "rabies" , "wildebeest" ,
"chassis" , "flounder" , "jackanapes" , "salmon" ,
"clippers" , "gallows" , "mackerel" , "scissors" ,
"cod" , "graffiti" , "measles" , "series" ,
"contretemps", "mews" , "shears" ,
"corps" , "mumps" , "species"
],
"uncountable": [
"advice" , "fruit" , "ketchup" , "meat" , "sand" ,
"bread" , "furniture" , "knowledge" , "mustard" , "software" ,
"butter" , "garbage" , "love" , "news" , "understanding",
"cheese" , "gravel" , "luggage" , "progress" , "water" ,
"electricity", "happiness" , "mathematics" , "research" ,
"equipment" , "information", "mayonnaise" , "rice"
],
"s-singular": [
"acropolis" , "caddis" , "dais" , "glottis" , "pathos" ,
"aegis" , "cannabis" , "digitalis" , "ibis" , "pelvis" ,
"alias" , "canvas" , "epidermis" , "lens" , "polis" ,
"asbestos" , "chaos" , "ethos" , "mantis" , "rhinoceros" ,
"bathos" , "cosmos" , "gas" , "marquis" , "sassafras" ,
"bias" , "glottis" , "metropolis" , "trellis"
],
"ex-ices": [
"codex" , "murex" , "silex"
],
"ex-ices*": [
"apex" , "index" , "pontifex" , "vertex" ,
"cortex" , "latex" , "simplex" , "vortex"
],
"um-a": [
"agendum" , "candelabrum", "desideratum" , "extremum" , "stratum" ,
"bacterium" , "datum" , "erratum" , "ovum"
],
"um-a*": [
"aquarium" , "emporium" , "maximum" , "optimum" , "stadium" ,
"compendium" , "enconium" , "medium" , "phylum" , "trapezium" ,
"consortium" , "gymnasium" , "memorandum" , "quantum" , "ultimatum" ,
"cranium" , "honorarium" , "millenium" , "rostrum" , "vacuum" ,
"curriculum" , "interregnum", "minimum" , "spectrum" , "velum" ,
"dictum" , "lustrum" , "momentum" , "speculum"
],
"on-a": [
"aphelion" , "hyperbaton" , "perihelion" ,
"asyndeton" , "noumenon" , "phenomenon" ,
"criterion" , "organon" , "prolegomenon"
],
"a-ae": [
"alga" , "alumna" , "vertebra"
],
"a-ae*": [
"abscissa" , "aurora" , "hyperbola" , "nebula" ,
"amoeba" , "formula" , "lacuna" , "nova" ,
"antenna" , "hydra" , "medusa" , "parabola"
],
"en-ina*": [
"foramen" , "lumen" , "stamen"
],
"a-ata*": [
"anathema" , "dogma" , "gumma" , "miasma" , "stigma" ,
"bema" , "drama" , "lemma" , "schema" , "stoma" ,
"carcinoma" , "edema" , "lymphoma" , "oedema" , "trauma" ,
"charisma" , "enema" , "magma" , "sarcoma" ,
"diploma" , "enigma" , "melisma" , "soma" ,
],
"is-ides*": [
"clitoris" , "iris"
],
"us-i*": [
"focus" , "nimbus" , "succubus" ,
"fungus" , "nucleolus" , "torus" ,
"genius" , "radius" , "umbilicus" ,
"incubus" , "stylus" , "uterus"
],
"us-us*": [
"apparatus" , "hiatus" , "plexus" , "status" ,
"cantus" , "impetus" , "prospectus" ,
"coitus" , "nexus" , "sinus" ,
],
"o-i*": [
"alto" , "canto" , "crescendo" , "soprano" ,
"basso" , "contralto" , "solo" , "tempo"
],
"-i*": [
"afreet" , "afrit" , "efreet"
],
"-im*": [
"cherub" , "goy" , "seraph"
],
"o-os": [
"albino" , "dynamo" , "guano" , "lumbago" , "photo" ,
"archipelago", "embryo" , "inferno" , "magneto" , "pro" ,
"armadillo" , "fiasco" , "jumbo" , "manifesto" , "quarto" ,
"commando" , "generalissimo", "medico" , "rhino" ,
"ditto" , "ghetto" , "lingo" , "octavo" , "stylo"
],
"general-generals": [
"Adjutant" , "Brigadier" , "Lieutenant" , "Major" , "Quartermaster",
"adjutant" , "brigadier" , "lieutenant" , "major" , "quartermaster"
]
}
def pluralize(word, pos=NOUN, custom={}, classical=True):
""" Returns the plural of a given word, e.g., child => children.
Handles nouns and adjectives, using classical inflection by default
(i.e., where "matrix" pluralizes to "matrices" and not "matrixes").
The custom dictionary is for user-defined replacements.
"""
if word in custom:
return custom[word]
# Recurse genitives.
# Remove the apostrophe and any trailing -s,
# form the plural of the resultant noun, and then append an apostrophe (dog's => dogs').
if word.endswith(("'", "'s")):
w = word.rstrip("'s")
w = pluralize(w, pos, custom, classical)
if w.endswith("s"):
return w + "'"
else:
return w + "'s"
# Recurse compound words
# (e.g., Postmasters General, mothers-in-law, Roman deities).
w = word.replace("-", " ").split(" ")
if len(w) > 1:
if w[1] == "general" or \
w[1] == "General" and \
w[0] not in plural_categories["general-generals"]:
return word.replace(w[0], pluralize(w[0], pos, custom, classical))
elif w[1] in plural_prepositions:
return word.replace(w[0], pluralize(w[0], pos, custom, classical))
else:
return word.replace(w[-1], pluralize(w[-1], pos, custom, classical))
# Only a very few number of adjectives inflect.
n = range(len(plural_rules))
if pos.startswith(ADJECTIVE):
n = [0, 1]
# Apply pluralization rules.
for i in n:
for suffix, inflection, category, classic in plural_rules[i]:
# A general rule, or a classic rule in classical mode.
if category is None:
if not classic or (classic and classical):
if suffix.search(word) is not None:
return suffix.sub(inflection, word)
# A rule pertaining to a specific category of words.
if category is not None:
if word in plural_categories[category] and (not classic or (classic and classical)):
if suffix.search(word) is not None:
return suffix.sub(inflection, word)
return word
#print pluralize("part-of-speech")
#print pluralize("child")
#print pluralize("dog's")
#print pluralize("wolf")
#print pluralize("bear")
#print pluralize("kitchen knife")
#print pluralize("octopus", classical=True)
#print pluralize("matrix", classical=True)
#print pluralize("matrix", classical=False)
#print pluralize("my", pos=ADJECTIVE)
#### SINGULARIZE ###################################################################################
# Adapted from Bermi Ferrer's Inflector for Python:
# http://www.bermi.org/inflector/
# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.
singular_rules = [
(r'(?i)(.)ae$' , '\\1a' ),
(r'(?i)(.)itis$' , '\\1itis' ),
(r'(?i)(.)eaux$' , '\\1eau' ),
(r'(?i)(quiz)zes$' , '\\1' ),
(r'(?i)(matr)ices$' , '\\1ix' ),
(r'(?i)(ap|vert|ind)ices$', '\\1ex' ),
(r'(?i)^(ox)en' , '\\1' ),
(r'(?i)(alias|status)es$' , '\\1' ),
(r'(?i)([octop|vir])i$' , '\\1us' ),
(r'(?i)(cris|ax|test)es$' , '\\1is' ),
(r'(?i)(shoe)s$' , '\\1' ),
(r'(?i)(o)es$' , '\\1' ),
(r'(?i)(bus)es$' , '\\1' ),
(r'(?i)([m|l])ice$' , '\\1ouse' ),
(r'(?i)(x|ch|ss|sh)es$' , '\\1' ),
(r'(?i)(m)ovies$' , '\\1ovie' ),
(r'(?i)(.)ombies$' , '\\1ombie'),
(r'(?i)(s)eries$' , '\\1eries'),
(r'(?i)([^aeiouy]|qu)ies$', '\\1y' ),
# -f, -fe sometimes take -ves in the plural
# (e.g., lives, wolves).
(r"([aeo]l)ves$" , "\\1f" ),
(r"([^d]ea)ves$" , "\\1f" ),
(r"arves$" , "arf" ),
(r"erves$" , "erve" ),
(r"([nlw]i)ves$" , "\\1fe" ),
(r'(?i)([lr])ves$' , '\\1f' ),
(r"([aeo])ves$" , "\\1ve" ),
(r'(?i)(sive)s$' , '\\1' ),
(r'(?i)(tive)s$' , '\\1' ),
(r'(?i)(hive)s$' , '\\1' ),
(r'(?i)([^f])ves$' , '\\1fe' ),
# -ses suffixes.
(r'(?i)(^analy)ses$' , '\\1sis' ),
(r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$', '\\1\\2sis'),
(r'(?i)(.)opses$' , '\\1opsis'),
(r'(?i)(.)yses$' , '\\1ysis' ),
(r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
(r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$', '\\1ose'),
(r'(?i)(.)oses$' , '\\1osis' ),
# -a
(r'(?i)([ti])a$' , '\\1um' ),
(r'(?i)(n)ews$' , '\\1ews' ),
(r'(?i)s$' , '' ),
]
# For performance, compile the regular expressions only once:
singular_rules = [(re.compile(r[0]), r[1]) for r in singular_rules]
singular_uninflected = set((
"bison" , "debris" , "headquarters", "pincers" , "trout" ,
"bream" , "diabetes" , "herpes" , "pliers" , "tuna" ,
"breeches" , "djinn" , "high-jinks" , "proceedings", "whiting" ,
"britches" , "eland" , "homework" , "rabies" , "wildebeest",
"carp" , "elk" , "innings" , "salmon" ,
"chassis" , "flounder" , "jackanapes" , "scissors" ,
"christmas" , "gallows" , "mackerel" , "series" ,
"clippers" , "georgia" , "measles" , "shears" ,
"cod" , "graffiti" , "mews" , "species" ,
"contretemps", "mumps" , "swine" ,
"corps" , "news" , "swiss" ,
))
singular_uncountable = set((
"advice" , "equipment", "happiness" , "luggage" , "news" , "software" ,
"bread" , "fruit" , "information" , "mathematics", "progress" , "understanding",
"butter" , "furniture", "ketchup" , "mayonnaise" , "research" , "water" ,
"cheese" , "garbage" , "knowledge" , "meat" , "rice" ,
"electricity", "gravel" , "love" , "mustard" , "sand" ,
))
singular_ie = set((
"alergie" , "cutie" , "hoagie" , "newbie" , "softie" , "veggie" ,
"auntie" , "doggie" , "hottie" , "nightie" , "sortie" , "weenie" ,
"beanie" , "eyrie" , "indie" , "oldie" , "stoolie" , "yuppie" ,
"birdie" , "freebie" , "junkie" , "^pie" , "sweetie" , "zombie" ,
"bogie" , "goonie" , "laddie" , "pixie" , "techie" ,
"bombie" , "groupie" , "laramie" , "quickie" , "^tie" ,
"collie" , "hankie" , "lingerie" , "reverie" , "toughie" ,
"cookie" , "hippie" , "meanie" , "rookie" , "valkyrie" ,
))
singular_irregular = {
"atlantes": "atlas",
"atlases": "atlas",
"axes": "axe",
"beeves": "beef",
"brethren": "brother",
"children": "child",
"corpora": "corpus",
"corpuses": "corpus",
"ephemerides": "ephemeris",
"feet": "foot",
"ganglia": "ganglion",
"geese": "goose",
"genera": "genus",
"genii": "genie",
"graffiti": "graffito",
"helves": "helve",
"kine": "cow",
"leaves": "leaf",
"loaves": "loaf",
"men": "man",
"mongooses": "mongoose",
"monies": "money",
"moves": "move",
"mythoi": "mythos",
"numena": "numen",
"occipita": "occiput",
"octopodes": "octopus",
"opera": "opus",
"opuses": "opus",
"our": "my",
"oxen": "ox",
"penes": "penis",
"penises": "penis",
"people": "person",
"sexes": "sex",
"soliloquies": "soliloquy",
"teeth": "tooth",
"testes": "testis",
"trilbys": "trilby",
"turves": "turf",
"zoa": "zoon",
}
def singularize(word, pos=NOUN, custom={}):
""" Returns the singular of a given word.
"""
if word in custom:
return custom[word]
# Recurse compound words (e.g. mothers-in-law).
if "-" in word:
w = word.split("-")
if len(w) > 1 and w[1] in plural_prepositions:
return singularize(w[0], pos, custom) + "-" + "-".join(w[1:])
# dogs' => dog's
if word.endswith("'"):
return singularize(word[:-1]) + "'s"
w = word.lower()
for x in singular_uninflected:
if x.endswith(w):
return word
for x in singular_uncountable:
if x.endswith(w):
return word
for x in singular_ie:
if w.endswith(x + "s"):
return w
for x in singular_irregular:
if w.endswith(x):
return re.sub('(?i)' + x + '$', singular_irregular[x], word)
for suffix, inflection in singular_rules:
m = suffix.search(word)
g = m and m.groups() or []
if m:
for k in range(len(g)):
if g[k] is None:
inflection = inflection.replace('\\' + str(k + 1), '')
return suffix.sub(inflection, word)
return word
#### VERB CONJUGATION ##############################################################################
class Verbs(_Verbs):
def __init__(self):
_Verbs.__init__(self, os.path.join(MODULE, "en-verbs.txt"),
language = "en",
format = [0, 1, 2, 3, 7, 8, 17, 18, 19, 23, 25, 24, 16, 9, 10, 11, 15, 33, 26, 27, 28, 32],
default = {
1: 0, 2: 0, 3: 0, 7: 0, # present singular => infinitive ("I walk")
4: 7, 5: 7, 6: 7, # present plural
17: 25, 18: 25, 19: 25, 23: 25, # past singular
20: 23, 21: 23, 22: 23, # past plural
9: 16, 10: 16, 11: 16, 15: 16, # present singular negated
12: 15, 13: 15, 14: 15, # present plural negated
26: 33, 27: 33, 28: 33, # past singular negated
29: 32, 30: 32, 31: 32, 32: 33 # past plural negated
})
def find_lemma(self, verb):
""" Returns the base form of the given inflected verb, using a rule-based approach.
This is problematic if a verb ending in -e is given in the past tense or gerund.
"""
v = verb.lower()
b = False
if v in ("'m", "'re", "'s", "n't"):
return "be"
if v in ("'d", "'ll"):
return "will"
if v in ("'ve"):
return "have"
if v.endswith("s"):
if v.endswith("ies") and len(v) > 3 and v[-4] not in VOWELS:
return v[:-3] + "y" # complies => comply
if v.endswith(("sses", "shes", "ches", "xes")):
return v[:-2] # kisses => kiss
return v[:-1]
if v.endswith("ied") and re_vowel.search(v[:-3]) is not None:
return v[:-3] + "y" # envied => envy
if v.endswith("ing") and re_vowel.search(v[:-3]) is not None:
v = v[:-3]; b = True; # chopping => chopp
if v.endswith("ed") and re_vowel.search(v[:-2]) is not None:
v = v[:-2]; b = True; # danced => danc
if b:
# Doubled consonant after short vowel: chopp => chop.
if len(v) > 3 and v[-1] == v[-2] and v[-3] in VOWELS and v[-4] not in VOWELS and not v.endswith("ss"):
return v[:-1]
if v.endswith(("ick", "ack")):
return v[:-1] # panick => panic
# Guess common cases where the base form ends in -e:
if v.endswith(("v", "z", "c", "i")):
return v + "e" # danc => dance
if v.endswith("g") and v.endswith(("dg", "lg", "ng", "rg")):
return v + "e" # indulg => indulge
if v.endswith(("b", "d", "g", "k", "l", "m", "r", "s", "t")) \
and len(v) > 2 and v[-2] in VOWELS and not v[-3] in VOWELS \
and not v.endswith("er"):
return v + "e" # generat => generate
if v.endswith("n") and v.endswith(("an", "in")) and not v.endswith(("ain", "oin", "oan")):
return v + "e" # imagin => imagine
if v.endswith("l") and len(v) > 1 and v[-2] not in VOWELS:
return v + "e" # squabbl => squabble
if v.endswith("f") and len(v) > 2 and v[-2] in VOWELS and v[-3] not in VOWELS:
return v + "e" # chaf => chafed
if v.endswith("e"):
return v + "e" # decre => decree
if v.endswith(("th", "ang", "un", "cr", "vr", "rs", "ps", "tr")):
return v + "e"
return v
def find_lexeme(self, verb):
""" For a regular verb (base form), returns the forms using a rule-based approach.
"""
v = verb.lower()
if len(v) > 1 and v.endswith("e") and v[-2] not in VOWELS:
# Verbs ending in a consonant followed by "e": dance, save, devote, evolve.
return [v, v, v, v + "s", v, v[:-1] + "ing"] + [v + "d"] * 6
if len(v) > 1 and v.endswith("y") and v[-2] not in VOWELS:
# Verbs ending in a consonant followed by "y": comply, copy, magnify.
return [v, v, v, v[:-1] + "ies", v, v + "ing"] + [v[:-1] + "ied"] * 6
if v.endswith(("ss", "sh", "ch", "x")):
# Verbs ending in sibilants: kiss, bless, box, polish, preach.
return [v, v, v, v + "es", v, v + "ing"] + [v + "ed"] * 6
if v.endswith("ic"):
# Verbs ending in -ic: panic, mimic.
return [v, v, v, v + "es", v, v + "king"] + [v + "ked"] * 6
if len(v) > 1 and v[-1] not in VOWELS and v[-2] not in VOWELS:
# Verbs ending in a consonant cluster: delight, clamp.
return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
if (len(v) > 1 and v.endswith(("y", "w")) and v[-2] in VOWELS) \
or (len(v) > 2 and v[-1] not in VOWELS and v[-2] in VOWELS and v[-3] in VOWELS) \
or (len(v) > 3 and v[-1] not in VOWELS and v[-3] in VOWELS and v[-4] in VOWELS):
# Verbs ending in a long vowel or diphthong followed by a consonant: paint, devour, play.
return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
if len(v) > 2 and v[-1] not in VOWELS and v[-2] in VOWELS and v[-3] not in VOWELS:
# Verbs ending in a short vowel followed by a consonant: chat, chop, or compel.
return [v, v, v, v + "s", v, v + v[-1] + "ing"] + [v + v[-1] + "ed"] * 6
return [v, v, v, v + "s", v, v + "ing"] + [v + "ed"] * 6
verbs = Verbs()
conjugate, lemma, lexeme, tenses = \
verbs.conjugate, verbs.lemma, verbs.lexeme, verbs.tenses
#print conjugate("imaginarify", "part", parse=True)
#print conjugate("imaginarify", "part", parse=False)
#### COMPARATIVE & SUPERLATIVE #####################################################################
VOWELS = "aeiouy"
grade_irregular = {
"bad": ( "worse", "worst"),
"far": ("further", "farthest"),
"good": ( "better", "best"),
"hind": ( "hinder", "hindmost"),
"ill": ( "worse", "worst"),
"less": ( "lesser", "least"),
"little": ( "less", "least"),
"many": ( "more", "most"),
"much": ( "more", "most"),
"well": ( "better", "best")
}
grade_uninflected = ["giant", "glib", "hurt", "known", "madly"]
COMPARATIVE = "er"
SUPERLATIVE = "est"
def _count_syllables(word):
""" Returns the estimated number of syllables in the word by counting vowel-groups.
"""
n = 0
p = False # True if the previous character was a vowel.
for ch in word.endswith("e") and word[:-1] or word:
v = ch in VOWELS
n += int(v and not p)
p = v
return n
def grade(adjective, suffix=COMPARATIVE):
""" Returns the comparative or superlative form of the given adjective.
"""
n = _count_syllables(adjective)
if adjective in grade_irregular:
# A number of adjectives inflect irregularly.
return grade_irregular[adjective][suffix != COMPARATIVE]
elif adjective in grade_uninflected:
# A number of adjectives don't inflect at all.
return "%s %s" % (suffix == COMPARATIVE and "more" or "most", adjective)
elif n <= 2 and adjective.endswith("e"):
# With one syllable and ending with an e: larger, wiser.
suffix = suffix.lstrip("e")
elif n == 1 and len(adjective) >= 3 \
and adjective[-1] not in VOWELS and adjective[-2] in VOWELS and adjective[-3] not in VOWELS:
# With one syllable ending with consonant-vowel-consonant: bigger, thinner.
if not adjective.endswith(("w")): # Exceptions: lower, newer.
suffix = adjective[-1] + suffix
elif n == 1:
# With one syllable ending with more consonants or vowels: briefer.
pass
elif n == 2 and adjective.endswith("y"):
# With two syllables ending with a y: funnier, hairier.
adjective = adjective[:-1] + "i"
elif n == 2 and adjective[-2:] in ("er", "le", "ow"):
# With two syllables and specific suffixes: gentler, narrower.
pass
else:
# With three or more syllables: more generous, more important.
return "%s %s" % (suffix == COMPARATIVE and "more" or "most", adjective)
return adjective + suffix
def comparative(adjective):
return grade(adjective, COMPARATIVE)
def superlative(adjective):
return grade(adjective, SUPERLATIVE)
#### ATTRIBUTIVE & PREDICATIVE #####################################################################
def attributive(adjective):
return adjective
def predicative(adjective):
return adjective