.. Copyright (C) 2001-2020 NLTK Project .. For license information, see LICENSE.TXT =================== Dependency Grammars =================== >>> from nltk.grammar import DependencyGrammar >>> from nltk.parse import ( ... DependencyGraph, ... ProjectiveDependencyParser, ... NonprojectiveDependencyParser, ... ) CoNLL Data ---------- >>> treebank_data = """Pierre NNP 2 NMOD ... Vinken NNP 8 SUB ... , , 2 P ... 61 CD 5 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 2 P ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 NMOD ... board NN 9 OBJ ... as IN 9 VMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... """ >>> dg = DependencyGraph(treebank_data) >>> dg.tree().pprint() (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29) .)) >>> for head, rel, dep in dg.triples(): ... print( ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})' ... .format(h=head, r=rel, d=dep) ... ) (will, MD), SUB, (Vinken, NNP) (Vinken, NNP), NMOD, (Pierre, NNP) (Vinken, NNP), P, (,, ,) (Vinken, NNP), NMOD, (old, JJ) (old, JJ), AMOD, (years, NNS) (years, NNS), NMOD, (61, CD) (Vinken, NNP), P, (,, ,) (will, MD), VC, (join, VB) (join, VB), OBJ, (board, NN) (board, NN), NMOD, (the, DT) (join, VB), VMOD, (as, IN) (as, IN), PMOD, (director, NN) (director, NN), NMOD, (a, DT) (director, NN), NMOD, (nonexecutive, JJ) (join, VB), VMOD, (Nov., NNP) (Nov., NNP), NMOD, (29, CD) (join, VB), VMOD, (., .) Using a custom cell extractor. >>> def custom_extractor(cells): ... _, tag, head, rel = cells ... return 'spam', 'spam', tag, tag, '', head, rel >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (spam (spam spam spam (spam (spam spam)) spam) (spam (spam spam) (spam (spam spam spam)) (spam spam) spam)) Custom cell extractors can take in and return an index. >>> def custom_extractor(cells, index): ... word, tag, head, rel = cells ... 
return (index, '{}-{}'.format(word, index), word, ... tag, tag, '', head, rel) >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (will-8 (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7) (join-9 (board-11 the-10) (as-12 (director-15 a-13 nonexecutive-14)) (Nov.-16 29-17) .-18)) Using the dependency-parsed version of the Penn Treebank corpus sample. >>> from nltk.corpus import dependency_treebank >>> t = dependency_treebank.parsed_sents()[0] >>> print(t.to_conll(3)) # doctest: +NORMALIZE_WHITESPACE Pierre NNP 2 Vinken NNP 8 , , 2 61 CD 5 years NNS 6 old JJ 2 , , 2 will MD 0 join VB 8 the DT 11 board NN 9 as IN 9 a DT 15 nonexecutive JJ 15 director NN 12 Nov. NNP 9 29 CD 16 . . 8 Using the output of zpar (like Malt-TAB, but with zero-based indexing). >>> zpar_data = """ ... Pierre NNP 1 NMOD ... Vinken NNP 7 SUB ... , , 1 P ... 61 CD 4 NMOD ... years NNS 5 AMOD ... old JJ 1 NMOD ... , , 1 P ... will MD -1 ROOT ... join VB 7 VC ... the DT 10 NMOD ... board NN 8 OBJ ... as IN 8 VMOD ... a DT 14 NMOD ... nonexecutive JJ 14 NMOD ... director NN 11 PMOD ... Nov. NNP 8 VMOD ... 29 CD 15 NMOD ... . . 7 P ... """ >>> zdg = DependencyGraph(zpar_data, zero_based=True) >>> print(zdg.tree()) (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29)) .) Projective Dependency Parsing ----------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'fell' -> 'price' | 'stock' ... 'price' -> 'of' 'the' ... 'of' -> 'stock' ... 'stock' -> 'the' ... """) >>> print(grammar) Dependency grammar with 5 productions 'fell' -> 'price' 'fell' -> 'stock' 'price' -> 'of' 'the' 'of' -> 'stock' 'stock' -> 'the' >>> dp = ProjectiveDependencyParser(grammar) >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])): ... 
print(t) (fell (price the (of (stock the)))) (fell (price the of) (stock the)) (fell (price the of the) stock) Non-Projective Dependency Parsing --------------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'taught' -> 'play' | 'man' ... 'man' -> 'the' ... 'play' -> 'golf' | 'dog' | 'to' ... 'dog' -> 'his' ... """) >>> print(grammar) Dependency grammar with 7 productions 'taught' -> 'play' 'taught' -> 'man' 'man' -> 'the' 'play' -> 'golf' 'play' -> 'dog' 'play' -> 'to' 'dog' -> 'his' >>> dp = NonprojectiveDependencyParser(grammar) >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']) >>> print(g.root['word']) taught >>> for _, node in sorted(g.nodes.items()): ... if node['word'] is not None: ... print('{address} {word}: {d}'.format(d=node['deps'][''], **node)) 1 the: [] 2 man: [1] 3 taught: [2, 7] 4 his: [] 5 dog: [4] 6 to: [] 7 play: [5, 6, 8] 8 golf: [] >>> print(g.tree()) (taught (man the) (play (dog his) to golf)) Integration with MALT parser ============================ If the top relation label differs from the default, it can be set explicitly with the ``top_relation_label`` argument. For output from the MALT parser, the label is ``'null'``. >>> dg_str = """1 I _ NN NN _ 2 nn _ _ ... 2 shot _ NN NN _ 0 null _ _ ... 3 an _ AT AT _ 2 dep _ _ ... 4 elephant _ NN NN _ 7 nn _ _ ... 5 in _ NN NN _ 7 nn _ _ ... 6 my _ NN NN _ 7 nn _ _ ... 7 pajamas _ NNS NNS _ 3 dobj _ _ ... """ >>> dg = DependencyGraph(dg_str, top_relation_label='null') >>> len(dg.nodes) 8 >>> dg.root['word'], dg.root['address'] ('shot', 2) >>> print(dg.to_conll(10)) # doctest: +NORMALIZE_WHITESPACE 1 I _ NN NN _ 2 nn _ _ 2 shot _ NN NN _ 0 null _ _ 3 an _ AT AT _ 2 dep _ _ 4 elephant _ NN NN _ 7 nn _ _ 5 in _ NN NN _ 7 nn _ _ 6 my _ NN NN _ 7 nn _ _ 7 pajamas _ NNS NNS _ 3 dobj _ _