You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

242 lines
7.3 KiB
Plaintext

5 years ago
.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===================
Dependency Grammars
===================
>>> from nltk.grammar import DependencyGrammar
>>> from nltk.parse import (
... DependencyGraph,
... ProjectiveDependencyParser,
... NonprojectiveDependencyParser,
... )
CoNLL Data
----------
>>> treebank_data = """Pierre NNP 2 NMOD
... Vinken NNP 8 SUB
... , , 2 P
... 61 CD 5 NMOD
... years NNS 6 AMOD
... old JJ 2 NMOD
... , , 2 P
... will MD 0 ROOT
... join VB 8 VC
... the DT 11 NMOD
... board NN 9 OBJ
... as IN 9 VMOD
... a DT 15 NMOD
... nonexecutive JJ 15 NMOD
... director NN 12 PMOD
... Nov. NNP 9 VMOD
... 29 CD 16 NMOD
... . . 9 VMOD
... """
>>> dg = DependencyGraph(treebank_data)
>>> dg.tree().pprint()
(will
  (Vinken Pierre , (old (years 61)) ,)
  (join (board the) (as (director a nonexecutive)) (Nov. 29) .))
>>> for head, rel, dep in dg.triples():
...     print(
...         '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
...         .format(h=head, r=rel, d=dep)
...     )
(will, MD), SUB, (Vinken, NNP)
(Vinken, NNP), NMOD, (Pierre, NNP)
(Vinken, NNP), P, (,, ,)
(Vinken, NNP), NMOD, (old, JJ)
(old, JJ), AMOD, (years, NNS)
(years, NNS), NMOD, (61, CD)
(Vinken, NNP), P, (,, ,)
(will, MD), VC, (join, VB)
(join, VB), OBJ, (board, NN)
(board, NN), NMOD, (the, DT)
(join, VB), VMOD, (as, IN)
(as, IN), PMOD, (director, NN)
(director, NN), NMOD, (a, DT)
(director, NN), NMOD, (nonexecutive, JJ)
(join, VB), VMOD, (Nov., NNP)
(Nov., NNP), NMOD, (29, CD)
(join, VB), VMOD, (., .)
Using a custom cell extractor (here, one that replaces every word with 'spam').
>>> def custom_extractor(cells):
...     _, tag, head, rel = cells
...     return 'spam', 'spam', tag, tag, '', head, rel
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
>>> dg.tree().pprint()
(spam
  (spam spam spam (spam (spam spam)) spam)
  (spam (spam spam) (spam (spam spam spam)) (spam spam) spam))
Custom cell extractors can also accept an index as a second argument and return it as the first element of the result.
>>> def custom_extractor(cells, index):
...     word, tag, head, rel = cells
...     return (index, '{}-{}'.format(word, index), word,
...             tag, tag, '', head, rel)
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
>>> dg.tree().pprint()
(will-8
  (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7)
  (join-9
    (board-11 the-10)
    (as-12 (director-15 a-13 nonexecutive-14))
    (Nov.-16 29-17)
    .-18))
Using the dependency-parsed version of the Penn Treebank corpus sample.
>>> from nltk.corpus import dependency_treebank
>>> t = dependency_treebank.parsed_sents()[0]
>>> print(t.to_conll(3)) # doctest: +NORMALIZE_WHITESPACE
Pierre NNP 2
Vinken NNP 8
, , 2
61 CD 5
years NNS 6
old JJ 2
, , 2
will MD 0
join VB 8
the DT 11
board NN 9
as IN 9
a DT 15
nonexecutive JJ 15
director NN 12
Nov. NNP 9
29 CD 16
. . 8
Using the output of zpar (like Malt-TAB, but with zero-based indexing).
>>> zpar_data = """
... Pierre NNP 1 NMOD
... Vinken NNP 7 SUB
... , , 1 P
... 61 CD 4 NMOD
... years NNS 5 AMOD
... old JJ 1 NMOD
... , , 1 P
... will MD -1 ROOT
... join VB 7 VC
... the DT 10 NMOD
... board NN 8 OBJ
... as IN 8 VMOD
... a DT 14 NMOD
... nonexecutive JJ 14 NMOD
... director NN 11 PMOD
... Nov. NNP 8 VMOD
... 29 CD 15 NMOD
... . . 7 P
... """
>>> zdg = DependencyGraph(zpar_data, zero_based=True)
>>> print(zdg.tree())
(will
  (Vinken Pierre , (old (years 61)) ,)
  (join (board the) (as (director a nonexecutive)) (Nov. 29))
  .)
Projective Dependency Parsing
-----------------------------
>>> grammar = DependencyGrammar.fromstring("""
... 'fell' -> 'price' | 'stock'
... 'price' -> 'of' 'the'
... 'of' -> 'stock'
... 'stock' -> 'the'
... """)
>>> print(grammar)
Dependency grammar with 5 productions
'fell' -> 'price'
'fell' -> 'stock'
'price' -> 'of' 'the'
'of' -> 'stock'
'stock' -> 'the'
>>> dp = ProjectiveDependencyParser(grammar)
>>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])):
...     print(t)
(fell (price the (of (stock the))))
(fell (price the of) (stock the))
(fell (price the of the) stock)
Non-Projective Dependency Parsing
---------------------------------
>>> grammar = DependencyGrammar.fromstring("""
... 'taught' -> 'play' | 'man'
... 'man' -> 'the'
... 'play' -> 'golf' | 'dog' | 'to'
... 'dog' -> 'his'
... """)
>>> print(grammar)
Dependency grammar with 7 productions
'taught' -> 'play'
'taught' -> 'man'
'man' -> 'the'
'play' -> 'golf'
'play' -> 'dog'
'play' -> 'to'
'dog' -> 'his'
>>> dp = NonprojectiveDependencyParser(grammar)
>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
>>> print(g.root['word'])
taught
>>> for _, node in sorted(g.nodes.items()):
...     if node['word'] is not None:
...         print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
1 the: []
2 man: [1]
3 taught: [2, 7]
4 his: []
5 dog: [4]
6 to: []
7 play: [5, 6, 8]
8 golf: []
>>> print(g.tree())
(taught (man the) (play (dog his) to golf))
Integration with MALT parser
============================
If the top relation label differs from the default, it can be set explicitly
with the `top_relation_label` argument. The MALT parser, for example, uses `'null'`.
>>> dg_str = """1 I _ NN NN _ 2 nn _ _
... 2 shot _ NN NN _ 0 null _ _
... 3 an _ AT AT _ 2 dep _ _
... 4 elephant _ NN NN _ 7 nn _ _
... 5 in _ NN NN _ 7 nn _ _
... 6 my _ NN NN _ 7 nn _ _
... 7 pajamas _ NNS NNS _ 3 dobj _ _
... """
>>> dg = DependencyGraph(dg_str, top_relation_label='null')
>>> len(dg.nodes)
8
>>> dg.root['word'], dg.root['address']
('shot', 2)
>>> print(dg.to_conll(10)) # doctest: +NORMALIZE_WHITESPACE
1 I _ NN NN _ 2 nn _ _
2 shot _ NN NN _ 0 null _ _
3 an _ AT AT _ 2 dep _ _
4 elephant _ NN NN _ 7 nn _ _
5 in _ NN NN _ 7 nn _ _
6 my _ NN NN _ 7 nn _ _
7 pajamas _ NNS NNS _ 3 dobj _ _