You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

786 lines
30 KiB
Python

5 years ago
# Natural Language Toolkit: Dependency Grammars
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (modifications)
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Tools for reading and writing dependency trees.
The input is assumed to be in Malt-TAB format
(http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
"""
from __future__ import print_function, unicode_literals
from collections import defaultdict
from itertools import chain
from pprint import pformat
import subprocess
import warnings
from six import string_types
from nltk.tree import Tree
from nltk.compat import python_2_unicode_compatible
#################################################################
# DependencyGraph Class
#################################################################
@python_2_unicode_compatible
class DependencyGraph(object):
"""
A container for the nodes and labelled edges of a dependency structure.
"""
def __init__(
self,
tree_str=None,
cell_extractor=None,
zero_based=False,
cell_separator=None,
top_relation_label='ROOT',
):
"""Dependency graph.
We place a dummy `TOP` node with the index 0, since the root node is
often assigned 0 as its head. This also means that the indexing of the
nodes corresponds directly to the Malt-TAB format, which starts at 1.
If zero-based is True, then Malt-TAB-like input with node numbers
starting at 0 and the root node assigned -1 (as produced by, e.g.,
zpar).
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for examlple, `ROOT`, `null` or `TOP`.
"""
self.nodes = defaultdict(
lambda: {
'address': None,
'word': None,
'lemma': None,
'ctag': None,
'tag': None,
'feats': None,
'head': None,
'deps': defaultdict(list),
'rel': None,
}
)
self.nodes[0].update({'ctag': 'TOP', 'tag': 'TOP', 'address': 0})
self.root = None
if tree_str:
self._parse(
tree_str,
cell_extractor=cell_extractor,
zero_based=zero_based,
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
def remove_by_address(self, address):
"""
Removes the node with the given address. References
to this node in others will still exist.
"""
del self.nodes[address]
def redirect_arcs(self, originals, redirect):
"""
Redirects arcs to any of the nodes in the originals list
to the redirect node address.
"""
for node in self.nodes.values():
new_deps = []
for dep in node['deps']:
if dep in originals:
new_deps.append(redirect)
else:
new_deps.append(dep)
node['deps'] = new_deps
def add_arc(self, head_address, mod_address):
"""
Adds an arc from the node specified by head_address to the
node specified by the mod address.
"""
relation = self.nodes[mod_address]['rel']
self.nodes[head_address]['deps'].setdefault(relation, [])
self.nodes[head_address]['deps'][relation].append(mod_address)
# self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self):
"""
Fully connects all non-root nodes. All nodes are set to be dependents
of the root node.
"""
for node1 in self.nodes.values():
for node2 in self.nodes.values():
if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
relation = node2['rel']
node1['deps'].setdefault(relation, [])
node1['deps'][relation].append(node2['address'])
# node1['deps'].append(node2['address'])
def get_by_address(self, node_address):
"""Return the node with the given address."""
return self.nodes[node_address]
def contains_address(self, node_address):
"""
Returns true if the graph contains a node with the given node
address, false otherwise.
"""
return node_address in self.nodes
def to_dot(self):
"""Return a dot representation suitable for using with Graphviz.
>>> dg = DependencyGraph(
... 'John N 2\\n'
... 'loves V 0\\n'
... 'Mary N 2'
... )
>>> print(dg.to_dot())
digraph G{
edge [dir=forward]
node [shape=plaintext]
<BLANKLINE>
0 [label="0 (None)"]
0 -> 2 [label="ROOT"]
1 [label="1 (John)"]
2 [label="2 (loves)"]
2 -> 1 [label=""]
2 -> 3 [label=""]
3 [label="3 (Mary)"]
}
"""
# Start the digraph specification
s = 'digraph G{\n'
s += 'edge [dir=forward]\n'
s += 'node [shape=plaintext]\n'
# Draw the remaining nodes
for node in sorted(self.nodes.values(), key=lambda v: v['address']):
s += '\n%s [label="%s (%s)"]' % (
node['address'],
node['address'],
node['word'],
)
for rel, deps in node['deps'].items():
for dep in deps:
if rel is not None:
s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
else:
s += '\n%s -> %s ' % (node['address'], dep)
s += "\n}"
return s
def _repr_svg_(self):
"""Show SVG representation of the transducer (IPython magic).
>>> dg = DependencyGraph(
... 'John N 2\\n'
... 'loves V 0\\n'
... 'Mary N 2'
... )
>>> dg._repr_svg_().split('\\n')[0]
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
"""
dot_string = self.to_dot()
try:
process = subprocess.Popen(
['dot', '-Tsvg'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
except OSError:
raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
if err:
raise Exception(
'Cannot create svg representation by running dot from string: {}'
''.format(dot_string)
)
return out
def __str__(self):
return pformat(self.nodes)
def __repr__(self):
return "<DependencyGraph with {0} nodes>".format(len(self.nodes))
@staticmethod
def load(
filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'
):
"""
:param filename: a name of a file in Malt-TAB format
:param zero_based: nodes in the input file are numbered starting from 0
rather than 1 (as produced by, e.g., zpar)
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for examlple, `ROOT`, `null` or `TOP`.
:return: a list of DependencyGraphs
"""
with open(filename) as infile:
return [
DependencyGraph(
tree_str,
zero_based=zero_based,
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
for tree_str in infile.read().split('\n\n')
]
def left_children(self, node_index):
"""
Returns the number of left children under the node specified
by the given address.
"""
children = chain.from_iterable(self.nodes[node_index]['deps'].values())
index = self.nodes[node_index]['address']
return sum(1 for c in children if c < index)
def right_children(self, node_index):
"""
Returns the number of right children under the node specified
by the given address.
"""
children = chain.from_iterable(self.nodes[node_index]['deps'].values())
index = self.nodes[node_index]['address']
return sum(1 for c in children if c > index)
def add_node(self, node):
if not self.contains_address(node['address']):
self.nodes[node['address']].update(node)
def _parse(
self,
input_,
cell_extractor=None,
zero_based=False,
cell_separator=None,
top_relation_label='ROOT',
):
"""Parse a sentence.
:param extractor: a function that given a tuple of cells returns a
7-tuple, where the values are ``word, lemma, ctag, tag, feats, head,
rel``.
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for examlple, `ROOT`, `null` or `TOP`.
"""
def extract_3_cells(cells, index):
word, tag, head = cells
return index, word, word, tag, tag, '', head, ''
def extract_4_cells(cells, index):
word, tag, head, rel = cells
return index, word, word, tag, tag, '', head, rel
def extract_7_cells(cells, index):
line_index, word, lemma, tag, _, head, rel = cells
try:
index = int(line_index)
except ValueError:
# index can't be parsed as an integer, use default
pass
return index, word, lemma, tag, tag, '', head, rel
def extract_10_cells(cells, index):
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
try:
index = int(line_index)
except ValueError:
# index can't be parsed as an integer, use default
pass
return index, word, lemma, ctag, tag, feats, head, rel
extractors = {
3: extract_3_cells,
4: extract_4_cells,
7: extract_7_cells,
10: extract_10_cells,
}
if isinstance(input_, string_types):
input_ = (line for line in input_.split('\n'))
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
cell_number = None
for index, line in enumerate(lines, start=1):
cells = line.split(cell_separator)
if cell_number is None:
cell_number = len(cells)
else:
assert cell_number == len(cells)
if cell_extractor is None:
try:
cell_extractor = extractors[cell_number]
except KeyError:
raise ValueError(
'Number of tab-delimited fields ({0}) not supported by '
'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
)
try:
index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(
cells, index
)
except (TypeError, ValueError):
# cell_extractor doesn't take 2 arguments or doesn't return 8
# values; assume the cell_extractor is an older external
# extractor and doesn't accept or return an index.
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
if head == '_':
continue
head = int(head)
if zero_based:
head += 1
self.nodes[index].update(
{
'address': index,
'word': word,
'lemma': lemma,
'ctag': ctag,
'tag': tag,
'feats': feats,
'head': head,
'rel': rel,
}
)
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = top_relation_label
self.nodes[head]['deps'][rel].append(index)
if self.nodes[0]['deps'][top_relation_label]:
root_address = self.nodes[0]['deps'][top_relation_label][0]
self.root = self.nodes[root_address]
self.top_relation_label = top_relation_label
else:
warnings.warn(
"The graph doesn't contain a node " "that depends on the root element."
)
def _word(self, node, filter=True):
w = node['word']
if filter:
if w != ',':
return w
return w
def _tree(self, i):
""" Turn dependency graphs into NLTK trees.
:param int i: index of a node
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
"""
node = self.get_by_address(i)
word = node['word']
deps = sorted(chain.from_iterable(node['deps'].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
else:
return word
def tree(self):
"""
Starting with the ``root`` node, build a dependency tree using the NLTK
``Tree`` constructor. Dependency labels are omitted.
"""
node = self.root
word = node['word']
deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
"""
Extract dependency triples of the form:
((head word, head tag), rel, (dep word, dep tag))
"""
if not node:
node = self.root
head = (node['word'], node['ctag'])
for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i)
yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep):
yield triple
def _hd(self, i):
try:
return self.nodes[i]['head']
except IndexError:
return None
def _rel(self, i):
try:
return self.nodes[i]['rel']
except IndexError:
return None
# what's the return type? Boolean or list?
def contains_cycle(self):
"""Check whether there are cycles.
>>> dg = DependencyGraph(treebank_data)
>>> dg.contains_cycle()
False
>>> cyclic_dg = DependencyGraph()
>>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
>>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
>>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
>>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
>>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
>>> cyclic_dg.nodes = {
... 0: top,
... 1: child1,
... 2: child2,
... 3: child3,
... 4: child4,
... }
>>> cyclic_dg.root = top
>>> cyclic_dg.contains_cycle()
[3, 1, 2, 4]
"""
distances = {}
for node in self.nodes.values():
for dep in node['deps']:
key = tuple([node['address'], dep])
distances[key] = 1
for _ in self.nodes:
new_entries = {}
for pair1 in distances:
for pair2 in distances:
if pair1[1] == pair2[0]:
key = tuple([pair1[0], pair2[1]])
new_entries[key] = distances[pair1] + distances[pair2]
for pair in new_entries:
distances[pair] = new_entries[pair]
if pair[0] == pair[1]:
path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0])
return path
return False # return []?
def get_cycle_path(self, curr_node, goal_node_index):
for dep in curr_node['deps']:
if dep == goal_node_index:
return [curr_node['address']]
for dep in curr_node['deps']:
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
if len(path) > 0:
path.insert(0, curr_node['address'])
return path
return []
def to_conll(self, style):
"""
The dependency graph in CoNLL format.
:param style: the style to use for the format (3, 4, 10 columns)
:type style: int
:rtype: str
"""
if style == 3:
template = '{word}\t{tag}\t{head}\n'
elif style == 4:
template = '{word}\t{tag}\t{head}\t{rel}\n'
elif style == 10:
template = (
'{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n'
)
else:
raise ValueError(
'Number of tab-delimited fields ({0}) not supported by '
'CoNLL(10) or Malt-Tab(4) format'.format(style)
)
return ''.join(
template.format(i=i, **node)
for i, node in sorted(self.nodes.items())
if node['tag'] != 'TOP'
)
def nx_graph(self):
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
import networkx
nx_nodelist = list(range(1, len(self.nodes)))
nx_edgelist = [
(n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n)
]
self.nx_labels = {}
for n in nx_nodelist:
self.nx_labels[n] = self.nodes[n]['word']
g = networkx.MultiDiGraph()
g.add_nodes_from(nx_nodelist)
g.add_edges_from(nx_edgelist)
return g
class DependencyGraphError(Exception):
"""Dependency graph exception."""
def demo():
malt_demo()
conll_demo()
conll_file_demo()
cycle_finding_demo()
def malt_demo(nx=False):
"""
A demonstration of the result of reading a dependency
version of the first sentence of the Penn Treebank.
"""
dg = DependencyGraph(
"""Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
years NNS 6 AMOD
old JJ 2 NMOD
, , 2 P
will MD 0 ROOT
join VB 8 VC
the DT 11 NMOD
board NN 9 OBJ
as IN 9 VMOD
a DT 15 NMOD
nonexecutive JJ 15 NMOD
director NN 12 PMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
"""
)
tree = dg.tree()
tree.pprint()
if nx:
# currently doesn't work
import networkx
from matplotlib import pylab
g = dg.nx_graph()
g.info()
pos = networkx.spring_layout(g, dim=1)
networkx.draw_networkx_nodes(g, pos, node_size=50)
# networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
pylab.xticks([])
pylab.yticks([])
pylab.savefig('tree.png')
pylab.show()
def conll_demo():
"""
A demonstration of how to read a string representation of
a CoNLL format dependency tree.
"""
dg = DependencyGraph(conll_data1)
tree = dg.tree()
tree.pprint()
print(dg)
print(dg.to_conll(4))
def conll_file_demo():
print('Mass conll_read demo...')
graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
for graph in graphs:
tree = graph.tree()
print('\n')
tree.pprint()
def cycle_finding_demo():
dg = DependencyGraph(treebank_data)
print(dg.contains_cycle())
cyclic_dg = DependencyGraph()
cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
print(cyclic_dg.contains_cycle())
treebank_data = """Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
years NNS 6 AMOD
old JJ 2 NMOD
, , 2 P
will MD 0 ROOT
join VB 8 VC
the DT 11 NMOD
board NN 9 OBJ
as IN 9 VMOD
a DT 15 NMOD
nonexecutive JJ 15 NMOD
director NN 12 PMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
"""
conll_data1 = """
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
7 gaan ga V V hulp|inf 6 vc _ _
8 winkelen winkel V V intrans|inf 11 cnj _ _
9 , , Punc Punc komma 8 punct _ _
10 zwemmen zwem V V intrans|inf 11 cnj _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
"""
conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _
2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _
4 wild wild Adj Adj attr|stell|onverv 5 mod _ _
5 zwaaien zwaai N N soort|mv|neut 2 vc _ _
6 . . Punc Punc punt 5 punct _ _
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
7 gaan ga V V hulp|inf 6 vc _ _
8 winkelen winkel V V intrans|inf 11 cnj _ _
9 , , Punc Punc komma 8 punct _ _
10 zwemmen zwem V V intrans|inf 11 cnj _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
1 Dat dat Pron Pron aanw|neut|attr 2 det _ _
2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _
5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _
6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _
7 . . Punc Punc punt 6 punct _ _
1 Het het Pron Pron onbep|neut|zelfst 2 su _ _
2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 bij bij Prep Prep voor 2 ld _ _
4 de de Art Art bep|zijdofmv|neut 6 det _ _
5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _
6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _
7 die die Pron Pron betr|neut|zelfst 6 mod _ _
8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _
9 ginds ginds Adv Adv gew|aanw 12 mod _ _
10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _
11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _
12 gelaten laat V V trans|verldw|onverv 11 vc _ _
13 . . Punc Punc punt 12 punct _ _
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _
3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _
4 naast naast Prep Prep voor 11 mod _ _
5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _
6 op op Prep Prep voor 11 ld _ _
7 de de Art Art bep|zijdofmv|neut 8 det _ _
8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _
9 kunnen kan V V hulp|inf 2 vc _ _
10 gaan ga V V hulp|inf 9 vc _ _
11 liggen lig V V intrans|inf 10 vc _ _
12 . . Punc Punc punt 11 punct _ _
1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _
2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _
3 mams mams N N soort|ev|neut 4 det _ _
4 rug rug N N soort|ev|neut 5 obj1 _ _
5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _
6 hebben heb V V hulp|inf 2 vc _ _
7 en en Conj Conj neven 0 ROOT _ _
8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _
9 de de Art Art bep|zijdofmv|neut 10 det _ _
10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _
11 . . Punc Punc punt 10 punct _ _
1 Of of Conj Conj onder|metfin 0 ROOT _ _
2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _
5 met met Prep Prep voor 10 mod _ _
6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _
7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _
8 rond rond Adv Adv deelv 10 svp _ _
9 kunnen kan V V hulp|inf 3 vc _ _
10 slenteren slenter V V intrans|inf 9 vc _ _
11 in in Prep Prep voor 10 mod _ _
12 de de Art Art bep|zijdofmv|neut 13 det _ _
13 buurt buurt N N soort|ev|neut 11 obj1 _ _
14 van van Prep Prep voor 13 mod _ _
15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _
16 . . Punc Punc punt 15 punct _ _
"""
if __name__ == '__main__':
demo()