# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""

from __future__ import unicode_literals

import re
from collections import defaultdict

from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
from nltk.compat import python_2_unicode_compatible
from nltk.internals import deprecated
from nltk.sem.logic import Expression

# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts
PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')

# Separates the next application operator from the remainder
APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')

# Splits a word or family definition into identifier, separator ('::' or '=>')
# and right-hand side (rhs)
LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)

# Parses the right-hand side, which contains a category and an optional
# semantic predicate
RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
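
# Illustrative sketch (not part of the original module): how the regexes above
# decompose a single lexicon line.
#
#     line = "eats => (S\\NP[sg])/NP {\\x y.eat(x,y)}"
#     ident, sep, rhs = LEX_RE.match(line).groups()
#     # ident == 'eats', sep == '=>', rhs == "(S\\NP[sg])/NP {\\x y.eat(x,y)}"
#     catstr, sem = RHS_RE.match(rhs).groups()
#     # catstr == '(S\\NP[sg])/NP', sem == '{\\x y.eat(x,y)}'
#     SEMANTICS_RE.match(sem).groups()[0]
#     # -> '\\x y.eat(x,y)'
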
class Token(object):
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        return self._categ

    def semantics(self):
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return "" + str(self._categ) + semantics_str

    def __cmp__(self, other):
        # Python 2 comparison hook; compare as (category, semantics) pairs.
        if not isinstance(other, Token):
            return -1
        return cmp((self._categ, self._semantics), (other.categ(), other.semantics()))
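
# Illustrative sketch (assumed usage; not from the original source): a Token
# pairs a word with its parsed category and an optional logical form.
#
#     cat, _ = augParseCategory('S\\NP', ['S', 'NP'], {})
#     tok = Token('sleeps', cat, Expression.fromstring(r'\x.sleep(x)'))
#     tok.categ()      # the parsed S\NP category
#     tok.semantics()  # the Expression \x.sleep(x)
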
@python_2_unicode_compatible
class CCGLexicon(object):
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.
        """
        string = ""
        first = True
        for ident in sorted(self._entries):
            if not first:
                string = string + "\n"
            string = string + ident + " => "

            first = True
            for cat in self._entries[ident]:
                if not first:
                    string = string + " | "
                else:
                    first = False
                string = string + "%s" % cat
        return string
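
# Illustrative sketch (hypothetical lexicon string): a CCGLexicon is normally
# built with fromstring() below rather than constructed directly.
#
#     lex = fromstring('''
#         :- S, NP
#         sleeps => S\\NP
#     ''')
#     lex.start()               # -> the target category S
#     lex.categories('sleeps')  # -> [Token with category S\NP]
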
# -----------
# Parsing lexicons
# -----------
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.
    """
    rest = string[1:]
    inside = "("

    while rest != "" and not rest.startswith(')'):
        if rest.startswith('('):
            (part, rest) = matchBrackets(rest)
            inside = inside + part
        else:
            inside = inside + rest[0]
            rest = rest[1:]
    if rest.startswith(')'):
        return (inside + ')', rest[1:])
    raise AssertionError('Unmatched bracket in string \'' + string + '\'')
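
# Illustrative example (assumed input, traced by hand): the first balanced
# bracket group is split off from the remainder.
#
#     matchBrackets("(S/NP)/N")   # -> ("(S/NP)", "/N")
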
def nextCategory(string):
    """
    Separate the next portion of the category from the rest of the string.
    """
    if string.startswith('('):
        return matchBrackets(string)
    return NEXTPRIM_RE.match(string).groups()


def parseApplication(app):
    """
    Parse an application operator.
    """
    return Direction(app[0], app[1:])


def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category.
    """
    if subscr:
        return subscr[1:-1].split(',')
    return []
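
# Illustrative examples (assumed inputs, traced against the helpers above):
#
#     nextCategory("NP[sg]/N")         # -> ("NP[sg]", "/N")
#     parseApplication(('/', '', ''))  # -> a forward ('/') Direction
#     parseSubscripts("[sg,masc]")     # -> ['sg', 'masc']
#     parseSubscripts(None)            # -> []
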
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.
    """
    if chunks[0] == "var":
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        'String \'' + catstr + '\' is neither a family nor primitive category.'
    )
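
# Illustrative sketch (assumed arguments): `chunks` is the pair of groups from
# PRIM_RE, e.g. PRIM_RE.match('NP[sg]').groups() == ('NP', '[sg]').
#
#     parsePrimitiveCategory(('NP', '[sg]'), ['S', 'NP', 'N'], {}, None)
#     # -> (PrimitiveCategory 'NP' restricted to [sg], None)
#     parsePrimitiveCategory(('var', None), ['S', 'NP'], {}, None)
#     # -> a fresh CCGVar, returned as both the category and the variable
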
def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and return a tuple containing the
    parsed category and (possibly) the CCG variable introduced for it.
    """
    (cat_string, rest) = nextCategory(line)

    if cat_string.startswith('('):
        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
    else:
        (res, var) = parsePrimitiveCategory(
            PRIM_RE.match(cat_string).groups(), primitives, families, var
        )

    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        if cat_string.startswith('('):
            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
        else:
            (arg, var) = parsePrimitiveCategory(
                PRIM_RE.match(cat_string).groups(), primitives, families, var
            )
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
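
# Illustrative example (assumed primitives; traced against the code above):
# category strings nest left-to-right, so 'S\\NP[sg]/NP' parses as (S\NP[sg])/NP.
#
#     cat, var = augParseCategory('S\\NP[sg]/NP', ['S', 'NP', 'N'], {})
#     # cat nests as FunctionalCategory(FunctionalCategory(S, NP[sg], backward), NP, forward)
#     # var is None, since no 'var' primitive occurs in the string
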
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(',')
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == '::':
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    return CCGLexicon(primitives[0], primitives, families, entries)
@deprecated('Use fromstring() instead.')
def parseLexicon(lex_str):
    return fromstring(lex_str)

openccg_tinytiny = fromstring(
"""
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
"""
)
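
# Illustrative usage sketch (assumes the companion nltk.ccg.chart module, which
# consumes lexicons like the one above):
#
#     from nltk.ccg import chart
#     parser = chart.CCGChartParser(openccg_tinytiny, chart.DefaultRuleSet)
#     for parse in parser.parse("the boys eat peaches".split()):
#         chart.printCCGDerivation(parse)
#
# Or query the lexicon directly:
#
#     openccg_tinytiny.categories('eats')  # Tokens for the IntransVsg and TransVsg entries
#     openccg_tinytiny.start()             # the target category S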