# Natural Language Toolkit: Recursive Descent Parser
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from nltk.grammar import Nonterminal
from nltk.tree import Tree, ImmutableTree

from nltk.parse.api import ParserI

##//////////////////////////////////////////////////////
## Recursive Descent Parser
##//////////////////////////////////////////////////////


class RecursiveDescentParser(ParserI):
    """
    A simple top-down CFG parser that parses texts by recursively
    expanding the fringe of a Tree, and matching it against a
    text.

    ``RecursiveDescentParser`` uses a list of tree locations called a
    "frontier" to remember which subtrees have not yet been expanded
    and which leaves have not yet been matched against the text.  Each
    tree location consists of a list of child indices specifying the
    path from the root of the tree to a subtree or a leaf; see the
    reference documentation for Tree for more information
    about tree locations.

    When the parser begins parsing a text, it constructs a tree
    containing only the start symbol, and a frontier containing the
    location of the tree's root node.  It then extends the tree to
    cover the text, using the following recursive procedure:

      - If the frontier is empty, and the text is covered by the tree,
        then return the tree as a possible parse.
      - If the frontier is empty, and the text is not covered by the
        tree, then return no parses.
      - If the first element of the frontier is a subtree, then
        use CFG productions to "expand" it.  For each applicable
        production, add the expanded subtree's children to the
        frontier, and recursively find all parses that can be
        generated by the new tree and frontier.
      - If the first element of the frontier is a token, then "match"
        it against the next token from the text.  Remove the token
        from the frontier, and recursively find all parses that can be
        generated by the new tree and frontier.

    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``RecursiveDescentParser`` that uses ``grammar``
        to parse texts.

        :type grammar: CFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar
        self._trace = trace

    def grammar(self):
        return self._grammar

    def parse(self, tokens):
        # Inherit docs from ParserI

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # Start a recursive descent parse, with an initial tree
        # containing just the start symbol.
        start = self._grammar.start().symbol()
        initial_tree = Tree(start, [])
        frontier = [()]
        if self._trace:
            self._trace_start(initial_tree, frontier, tokens)
        return self._parse(tokens, initial_tree, frontier)

    def _parse(self, remaining_text, tree, frontier):
        """
        Recursively expand and match each element of ``tree`` specified
        by ``frontier``, to cover ``remaining_text``.  Return an
        iterator of all parses found.

        :return: An iterator of all parses that can be generated by
            matching and expanding the elements of ``tree``
            specified by ``frontier``.
        :rtype: iter(Tree)
        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list(tuple(int))
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.  This list is sorted
            in left-to-right order of location within the tree.
        """

        # If the tree covers the text, and there's nothing left to
        # expand, then we've found a complete parse; return it.
        if len(remaining_text) == 0 and len(frontier) == 0:
            if self._trace:
                self._trace_succeed(tree, frontier)
            yield tree

        # If there's still text, but nothing left to expand, we failed.
        elif len(frontier) == 0:
            if self._trace:
                self._trace_backtrack(tree, frontier)

        # If the next element on the frontier is a tree, expand it.
        elif isinstance(tree[frontier[0]], Tree):
            for result in self._expand(remaining_text, tree, frontier):
                yield result

        # If the next element on the frontier is a token, match it.
        else:
            for result in self._match(remaining_text, tree, frontier):
                yield result

    def _match(self, rtext, tree, frontier):
        """
        :rtype: iter(Tree)
        :return: an iterator of all parses that can be generated by
            matching the first element of ``frontier`` against the
            first token in ``rtext``.  In particular, if the first
            element of ``frontier`` has the same type as the first
            token in ``rtext``, then substitute the token into
            ``tree``; and return all parses that can be generated by
            matching and expanding the remaining elements of
            ``frontier``.  If the first element of ``frontier`` does not
            have the same type as the first token in ``rtext``, then
            generate no parses.

        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type rtext: list(str)
        :param rtext: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list(tuple(int))
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.
        """

        tree_leaf = tree[frontier[0]]
        if len(rtext) > 0 and tree_leaf == rtext[0]:
            # If it's a terminal that matches rtext[0], then substitute
            # in the token, and continue parsing.
            newtree = tree.copy(deep=True)
            newtree[frontier[0]] = rtext[0]
            if self._trace:
                self._trace_match(newtree, frontier[1:], rtext[0])
            for result in self._parse(rtext[1:], newtree, frontier[1:]):
                yield result
        else:
            # If it's a non-matching terminal, fail.
            if self._trace:
                self._trace_backtrack(tree, frontier, rtext[:1])

    def _expand(self, remaining_text, tree, frontier, production=None):
        """
        :rtype: iter(Tree)
        :return: An iterator of all parses that can be generated by
            expanding the first element of ``frontier`` with
            ``production``.  In particular, if the first element of
            ``frontier`` is a subtree whose node type is equal to
            ``production``'s left hand side, then add a child to that
            subtree for each element of ``production``'s right hand
            side; and return all parses that can be generated by
            matching and expanding the remaining elements of
            ``frontier``.  If the first element of ``frontier`` is not a
            subtree whose node type is equal to ``production``'s left
            hand side, then generate no parses.  If ``production`` is
            not specified, then generate all parses that can be
            obtained by expanding the first element of ``frontier``
            with *any* CFG production.

        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list(tuple(int))
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.
        """

        if production is None:
            productions = self._grammar.productions()
        else:
            productions = [production]

        for production in productions:
            lhs = production.lhs().symbol()
            if lhs == tree[frontier[0]].label():
                subtree = self._production_to_tree(production)
                if frontier[0] == ():
                    newtree = subtree
                else:
                    newtree = tree.copy(deep=True)
                    newtree[frontier[0]] = subtree
                new_frontier = [
                    frontier[0] + (i,) for i in range(len(production.rhs()))
                ]
                if self._trace:
                    self._trace_expand(newtree, new_frontier, production)
                for result in self._parse(
                    remaining_text, newtree, new_frontier + frontier[1:]
                ):
                    yield result

    def _production_to_tree(self, production):
        """
        :rtype: Tree
        :return: The Tree that is licensed by ``production``.
            In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
            return a tree that has a node ``lhs.symbol``, and
            ``n`` children.  For each nonterminal element
            ``elt[i]`` in the production, the tree token has a
            childless subtree with node value ``elt[i].symbol``; and
            for each terminal element ``elt[j]``, the tree token has
            a leaf token with type ``elt[j]``.

        :param production: The CFG production that licenses the tree
            token that should be returned.
        :type production: Production
        """
        children = []
        for elt in production.rhs():
            if isinstance(elt, Nonterminal):
                children.append(Tree(elt.symbol(), []))
            else:
                # This will be matched.
                children.append(elt)
        return Tree(production.lhs().symbol(), children)

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace

    def _trace_fringe(self, tree, treeloc=None):
        """
        Print trace output displaying the fringe of ``tree``.  The
        fringe of ``tree`` consists of all of its leaves and all of
        its childless subtrees.

        :rtype: None
        """

        if treeloc == ():
            print("*", end=" ")
        if isinstance(tree, Tree):
            if len(tree) == 0:
                print(repr(Nonterminal(tree.label())), end=" ")
            for i in range(len(tree)):
                if treeloc is not None and i == treeloc[0]:
                    self._trace_fringe(tree[i], treeloc[1:])
                else:
                    self._trace_fringe(tree[i])
        else:
            print(repr(tree), end=" ")

    def _trace_tree(self, tree, frontier, operation):
        """
        Print trace output displaying the parser's current state.

        :param operation: A character identifying the operation that
            generated the current state.
        :rtype: None
        """
        if self._trace == 2:
            print("  %c [" % operation, end=" ")
        else:
            print("    [", end=" ")
        if len(frontier) > 0:
            self._trace_fringe(tree, frontier[0])
        else:
            self._trace_fringe(tree)
        print("]")

    def _trace_start(self, tree, frontier, text):
        print("Parsing %r" % " ".join(text))
        if self._trace > 2:
            print("Start:")
        if self._trace > 1:
            self._trace_tree(tree, frontier, " ")

    def _trace_expand(self, tree, frontier, production):
        if self._trace > 2:
            print("Expand: %s" % production)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "E")

    def _trace_match(self, tree, frontier, tok):
        if self._trace > 2:
            print("Match: %r" % tok)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "M")

    def _trace_succeed(self, tree, frontier):
        if self._trace > 2:
            print("GOOD PARSE:")
        if self._trace == 1:
            print("Found a parse:\n%s" % tree)
        if self._trace > 1:
            self._trace_tree(tree, frontier, "+")

    def _trace_backtrack(self, tree, frontier, toks=None):
        if self._trace > 2:
            if toks:
                print("Backtrack: %r match failed" % toks[0])
            else:
                print("Backtrack")


##//////////////////////////////////////////////////////
## Stepping Recursive Descent Parser
##//////////////////////////////////////////////////////


class SteppingRecursiveDescentParser(RecursiveDescentParser):
    """
    A ``RecursiveDescentParser`` that allows you to step through the
    parsing process, performing a single operation at a time.

    The ``initialize`` method is used to start parsing a text.
    ``expand`` expands the first element on the frontier using a single
    CFG production, and ``match`` matches the first element on the
    frontier against the next text token.  ``backtrack`` undoes the most
    recent expand or match operation.  ``step`` performs a single
    expand, match, or backtrack operation.  ``parses`` returns the set
    of parses that have been found by the parser.

    :ivar _history: A list of ``(rtext, tree, frontier)`` triples,
        containing the previous states of the parser.  This history is
        used to implement the ``backtrack`` operation.
    :ivar _tried_e: A record of all productions that have been tried
        for a given tree.  This record is used by ``expand`` to perform
        the next untried production.
    :ivar _tried_m: A record of what tokens have been matched for a
        given tree.  This record is used by ``step`` to decide whether
        or not to match a token.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super().__init__(grammar, trace)
        self._rtext = None
        self._tree = None
        self._frontier = [()]
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []

    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
    # something nicer when we get the chance.
    def _freeze(self, tree):
        c = tree.copy()
        # for pos in c.treepositions('leaves'):
        #     c[pos] = c[pos].freeze()
        return ImmutableTree.convert(c)

    def parse(self, tokens):
        tokens = list(tokens)
        self.initialize(tokens)
        while self.step() is not None:
            pass
        return self.parses()

    def initialize(self, tokens):
        """
        Start parsing a given text.  This sets the parser's tree to
        the start symbol, its frontier to the root node, and its
        remaining text to ``tokens``.
        """

        self._rtext = tokens
        start = self._grammar.start().symbol()
        self._tree = Tree(start, [])
        self._frontier = [()]
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []
        if self._trace:
            self._trace_start(self._tree, self._frontier, self._rtext)

    def remaining_text(self):
        """
        :return: The portion of the text that is not yet covered by the
            tree.
        :rtype: list(str)
        """
        return self._rtext

    def frontier(self):
        """
        :return: A list of the tree locations of all subtrees that
            have not yet been expanded, and all leaves that have not
            yet been matched.
        :rtype: list(tuple(int))
        """
        return self._frontier

    def tree(self):
        """
        :return: A partial structure for the text that is
            currently being parsed.  The elements specified by the
            frontier have not yet been expanded or matched.
        :rtype: Tree
        """
        return self._tree

    def step(self):
        """
        Perform a single parsing operation.  If an untried match is
        possible, then perform the match, and return the matched
        token.  If an untried expansion is possible, then perform the
        expansion, and return the production that it is based on.  If
        backtracking is possible, then backtrack, and return True.
        Otherwise, return None.

        :return: None if no operation was performed; a token if a match
            was performed; a production if an expansion was performed;
            and True if a backtrack operation was performed.
        :rtype: Production or str or bool
        """
        # Try matching (if we haven't already)
        if self.untried_match():
            token = self.match()
            if token is not None:
                return token

        # Try expanding.
        production = self.expand()
        if production is not None:
            return production

        # Try backtracking
        if self.backtrack():
            self._trace_backtrack(self._tree, self._frontier)
            return True

        # Nothing left to do.
        return None

    def expand(self, production=None):
        """
        Expand the first element of the frontier.  In particular, if
        the first element of the frontier is a subtree whose node type
        is equal to ``production``'s left hand side, then add a child
        to that subtree for each element of ``production``'s right hand
        side.  If ``production`` is not specified, then use the first
        untried expandable production.  If all expandable productions
        have been tried, do nothing.

        :return: The production used to expand the frontier, if an
            expansion was performed.  If no expansion was performed,
            return None.
        :rtype: Production or None
        """

        # Make sure we *can* expand.
        if len(self._frontier) == 0:
            return None
        if not isinstance(self._tree[self._frontier[0]], Tree):
            return None

        # If they didn't specify a production, check all untried ones.
        if production is None:
            productions = self.untried_expandable_productions()
        else:
            productions = [production]

        for prod in productions:
            # Record that we've tried this production now.
            self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)

            # Try expanding.
            for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
                return prod

        # We didn't expand anything.
        return None

    def match(self):
        """
        Match the first element of the frontier.  In particular, if
        the first element of the frontier has the same type as the
        next text token, then substitute the text token into the tree.

        :return: The token matched, if a match operation was
            performed.  If no match was performed, return None.
        :rtype: str or None
        """

        # Record that we've tried matching this token.
        tok = self._rtext[0]
        self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)

        # Make sure we *can* match.
        if len(self._frontier) == 0:
            return None
        if isinstance(self._tree[self._frontier[0]], Tree):
            return None

        for _result in self._match(self._rtext, self._tree, self._frontier):
            # Return the token we just matched.
            return self._history[-1][0][0]
        return None

    def backtrack(self):
        """
        Return the parser to its state before the most recent
        match or expand operation.  Calling ``backtrack`` repeatedly
        returns the parser to successively earlier states.  If no
        match or expand operations have been performed, ``backtrack``
        will make no changes.

        :return: True if an operation was successfully undone.
        :rtype: bool
        """
        if len(self._history) == 0:
            return False
        (self._rtext, self._tree, self._frontier) = self._history.pop()
        return True

    def expandable_productions(self):
        """
        :return: A list of all the productions for which expansions
            are available for the current parser state.
        :rtype: list(Production)
        """
        # Make sure we *can* expand.
        if len(self._frontier) == 0:
            return []
        frontier_child = self._tree[self._frontier[0]]
        if not isinstance(frontier_child, Tree):
            return []

        return [
            p
            for p in self._grammar.productions()
            if p.lhs().symbol() == frontier_child.label()
        ]

    def untried_expandable_productions(self):
        """
        :return: A list of all the untried productions for which
            expansions are available for the current parser state.
        :rtype: list(Production)
        """

        tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
        return [p for p in self.expandable_productions() if p not in tried_expansions]

    def untried_match(self):
        """
        :return: Whether the first element of the frontier is a token
            that has not yet been matched.
        :rtype: bool
        """

        if len(self._rtext) == 0:
            return False
        tried_matches = self._tried_m.get(self._freeze(self._tree), [])
        return self._rtext[0] not in tried_matches

    def currently_complete(self):
        """
        :return: Whether the parser's current state represents a
            complete parse.
        :rtype: bool
        """
        return len(self._frontier) == 0 and len(self._rtext) == 0

    def _parse(self, remaining_text, tree, frontier):
        """
        A stub version of ``_parse`` that sets the parser's current
        state to the given arguments.  In ``RecursiveDescentParser``,
        the ``_parse`` method is used to recursively continue parsing a
        text.  ``SteppingRecursiveDescentParser`` overrides it to
        capture these recursive calls.  It records the parser's old
        state in the history (to allow for backtracking), and updates
        the parser's new state using the given arguments.  Finally, it
        returns ``[1]``, which is used by ``match`` and ``expand`` to
        detect whether their operations were successful.

        :return: ``[1]``
        :rtype: list(int)
        """
        self._history.append((self._rtext, self._tree, self._frontier))
        self._rtext = remaining_text
        self._tree = tree
        self._frontier = frontier

        # Is it a good parse?  If so, record it.
        if len(frontier) == 0 and len(remaining_text) == 0:
            self._parses.append(tree)
            self._trace_succeed(self._tree, self._frontier)

        return [1]

    def parses(self):
        """
        :return: An iterator of the parses that have been found by this
            parser so far.
        :rtype: iter(Tree)
        """
        return iter(self._parses)

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar


##//////////////////////////////////////////////////////
## Demonstration Code
##//////////////////////////////////////////////////////


def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk import parse, CFG

    grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    for prod in grammar.productions():
        print(prod)

    sent = "I saw a man in the park".split()
    parser = parse.RecursiveDescentParser(grammar, trace=2)
    for p in parser.parse(sent):
        print(p)


if __name__ == "__main__":
    demo()