You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

262 lines
7.4 KiB
Python

# Natural Language Toolkit: Lexical Functional Grammar
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, division, unicode_literals
from itertools import chain
from nltk.internals import Counter
from nltk.compat import python_2_unicode_compatible
@python_2_unicode_compatible
class FStructure(dict):
def safeappend(self, key, item):
"""
Append 'item' to the list at 'key'. If no list exists for 'key', then
construct one.
"""
if key not in self:
self[key] = []
self[key].append(item)
def __setitem__(self, key, value):
dict.__setitem__(self, key.lower(), value)
def __getitem__(self, key):
return dict.__getitem__(self, key.lower())
def __contains__(self, key):
return dict.__contains__(self, key.lower())
def to_glueformula_list(self, glue_dict):
depgraph = self.to_depgraph()
return glue_dict.to_glueformula_list(depgraph)
def to_depgraph(self, rel=None):
from nltk.parse.dependencygraph import DependencyGraph
depgraph = DependencyGraph()
nodes = depgraph.nodes
self._to_depgraph(nodes, 0, 'ROOT')
# Add all the dependencies for all the nodes
for address, node in nodes.items():
for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
if n2['head'] == address:
relation = n2['rel']
node['deps'].setdefault(relation, [])
node['deps'][relation].append(n2['address'])
depgraph.root = nodes[1]
return depgraph
def _to_depgraph(self, nodes, head, rel):
index = len(nodes)
nodes[index].update(
{
'address': index,
'word': self.pred[0],
'tag': self.pred[1],
'head': head,
'rel': rel,
}
)
for feature in sorted(self):
for item in sorted(self[feature]):
if isinstance(item, FStructure):
item._to_depgraph(nodes, index, feature)
elif isinstance(item, tuple):
new_index = len(nodes)
nodes[new_index].update(
{
'address': new_index,
'word': item[0],
'tag': item[1],
'head': index,
'rel': feature,
}
)
elif isinstance(item, list):
for n in item:
n._to_depgraph(nodes, index, feature)
else:
raise Exception(
'feature %s is not an FStruct, a list, or a tuple' % feature
)
@staticmethod
def read_depgraph(depgraph):
return FStructure._read_depgraph(depgraph.root, depgraph)
@staticmethod
def _read_depgraph(node, depgraph, label_counter=None, parent=None):
if not label_counter:
label_counter = Counter()
if node['rel'].lower() in ['spec', 'punct']:
# the value of a 'spec' entry is a word, not an FStructure
return (node['word'], node['tag'])
else:
fstruct = FStructure()
fstruct.pred = None
fstruct.label = FStructure._make_label(label_counter.get())
fstruct.parent = parent
word, tag = node['word'], node['tag']
if tag[:2] == 'VB':
if tag[2:3] == 'D':
fstruct.safeappend('tense', ('PAST', 'tense'))
fstruct.pred = (word, tag[:2])
if not fstruct.pred:
fstruct.pred = (word, tag)
children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
for child in children:
fstruct.safeappend(
child['rel'],
FStructure._read_depgraph(child, depgraph, label_counter, fstruct),
)
return fstruct
@staticmethod
def _make_label(value):
"""
Pick an alphabetic character as identifier for an entity in the model.
:param value: where to index into the list of characters
:type value: int
"""
letter = [
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z',
'a',
'b',
'c',
'd',
'e',
][value - 1]
num = int(value) // 26
if num > 0:
return letter + str(num)
else:
return letter
def __repr__(self):
return self.__unicode__().replace('\n', '')
def __str__(self):
return self.pretty_format()
def pretty_format(self, indent=3):
try:
accum = '%s:[' % self.label
except NameError:
accum = '['
try:
accum += 'pred \'%s\'' % (self.pred[0])
except NameError:
pass
for feature in sorted(self):
for item in self[feature]:
if isinstance(item, FStructure):
next_indent = indent + len(feature) + 3 + len(self.label)
accum += '\n%s%s %s' % (
' ' * (indent),
feature,
item.pretty_format(next_indent),
)
elif isinstance(item, tuple):
accum += '\n%s%s \'%s\'' % (' ' * (indent), feature, item[0])
elif isinstance(item, list):
accum += '\n%s%s {%s}' % (
' ' * (indent),
feature,
('\n%s' % (' ' * (indent + len(feature) + 2))).join(item),
)
else: # ERROR
raise Exception(
'feature %s is not an FStruct, a list, or a tuple' % feature
)
return accum + ']'
def demo_read_depgraph():
from nltk.parse.dependencygraph import DependencyGraph
dg1 = DependencyGraph(
"""\
Esso NNP 2 SUB
said VBD 0 ROOT
the DT 5 NMOD
Whiting NNP 5 NMOD
field NN 6 SUB
started VBD 2 VMOD
production NN 6 OBJ
Tuesday NNP 6 VMOD
"""
)
dg2 = DependencyGraph(
"""\
John NNP 2 SUB
sees VBP 0 ROOT
Mary NNP 2 OBJ
"""
)
dg3 = DependencyGraph(
"""\
a DT 2 SPEC
man NN 3 SUBJ
walks VB 0 ROOT
"""
)
dg4 = DependencyGraph(
"""\
every DT 2 SPEC
girl NN 3 SUBJ
chases VB 0 ROOT
a DT 5 SPEC
dog NN 3 OBJ
"""
)
depgraphs = [dg1, dg2, dg3, dg4]
for dg in depgraphs:
print(FStructure.read_depgraph(dg))
if __name__ == '__main__':
demo_read_depgraph()