You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

556 lines
18 KiB
Python

# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Rob Speer <rspeer@mit.edu>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Data classes and parser implementations for *incremental* chart
parsers, which use dynamic programming to efficiently parse a text.
A "chart parser" derives parse trees for a text by iteratively adding
\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
structure for a subsequence of the text. The "chart" is a
\"blackboard\" for composing and combining these hypotheses.
A parser is "incremental", if it guarantees that for all i, j where i < j,
all edges ending at i are built before any edges ending at j.
This is appealing for, say, speech recognizer hypothesis filtering.
The main parser class is ``EarleyChartParser``, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
from time import perf_counter
from nltk.parse.chart import (
Chart,
ChartParser,
EdgeI,
LeafEdge,
LeafInitRule,
BottomUpPredictRule,
BottomUpPredictCombineRule,
TopDownInitRule,
SingleEdgeFundamentalRule,
EmptyPredictRule,
CachedTopDownPredictRule,
FilteredSingleEdgeFundamentalRule,
FilteredBottomUpPredictCombineRule,
)
from nltk.parse.featurechart import (
FeatureChart,
FeatureChartParser,
FeatureTopDownInitRule,
FeatureTopDownPredictRule,
FeatureEmptyPredictRule,
FeatureBottomUpPredictRule,
FeatureBottomUpPredictCombineRule,
FeatureSingleEdgeFundamentalRule,
)
# ////////////////////////////////////////////////////////////
# Incremental Chart
# ////////////////////////////////////////////////////////////
class IncrementalChart(Chart):
def initialize(self):
# A sequence of edge lists contained in this chart.
self._edgelists = tuple([] for x in self._positions())
# The set of child pointer lists associated with each edge.
self._edge_to_cpls = {}
# Indexes mapping attribute values to lists of edges
# (used by select()).
self._indexes = {}
def edges(self):
return list(self.iteredges())
def iteredges(self):
return (edge for edgelist in self._edgelists for edge in edgelist)
def select(self, end, **restrictions):
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
if restrictions == {}:
return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
restr_keys = tuple(restr_keys)
# If it doesn't exist, then create it.
if restr_keys not in self._indexes:
self._add_index(restr_keys)
vals = tuple(restrictions[key] for key in restr_keys)
return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
raise ValueError("Bad restriction: %s" % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
# Add all existing edges to the index.
for end, edgelist in enumerate(self._edgelists):
this_index = index[end]
for edge in edgelist:
vals = tuple(getattr(edge, key)() for key in restr_keys)
this_index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
end = edge.end()
for (restr_keys, index) in self._indexes.items():
vals = tuple(getattr(edge, key)() for key in restr_keys)
index[end].setdefault(vals, []).append(edge)
def _append_edge(self, edge):
self._edgelists[edge.end()].append(edge)
def _positions(self):
return range(self.num_leaves() + 1)
class FeatureIncrementalChart(IncrementalChart, FeatureChart):
def select(self, end, **restrictions):
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
if restrictions == {}:
return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
restr_keys = tuple(restr_keys)
# If it doesn't exist, then create it.
if restr_keys not in self._indexes:
self._add_index(restr_keys)
vals = tuple(
self._get_type_if_possible(restrictions[key]) for key in restr_keys
)
return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
raise ValueError("Bad restriction: %s" % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
# Add all existing edges to the index.
for end, edgelist in enumerate(self._edgelists):
this_index = index[end]
for edge in edgelist:
vals = tuple(
self._get_type_if_possible(getattr(edge, key)())
for key in restr_keys
)
this_index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
end = edge.end()
for (restr_keys, index) in self._indexes.items():
vals = tuple(
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
)
index[end].setdefault(vals, []).append(edge)
# ////////////////////////////////////////////////////////////
# Incremental CFG Rules
# ////////////////////////////////////////////////////////////
class CompleteFundamentalRule(SingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
for right_edge in chart.select(
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
class CompleterRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
def apply(self, chart, grammar, edge):
if not isinstance(edge, LeafEdge):
for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
yield new_edge
class ScannerRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
def apply(self, chart, grammar, edge):
if isinstance(edge, LeafEdge):
for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
yield new_edge
class PredictorRule(CachedTopDownPredictRule):
pass
class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
def apply(self, chart, grammar, edge):
# Since the Filtered rule only works for grammars without empty productions,
# we only have to bother with complete edges here.
if edge.is_complete():
for new_edge in self._apply_complete(chart, grammar, edge):
yield new_edge
# ////////////////////////////////////////////////////////////
# Incremental FCFG Rules
# ////////////////////////////////////////////////////////////
class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
fr = self._fundamental_rule
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
for right_edge in chart.select(
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
):
for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
yield new_edge
class FeatureCompleterRule(CompleterRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
class FeatureScannerRule(ScannerRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
class FeaturePredictorRule(FeatureTopDownPredictRule):
pass
# ////////////////////////////////////////////////////////////
# Incremental CFG Chart Parsers
# ////////////////////////////////////////////////////////////
EARLEY_STRATEGY = [
LeafInitRule(),
TopDownInitRule(),
CompleterRule(),
ScannerRule(),
PredictorRule(),
]
TD_INCREMENTAL_STRATEGY = [
LeafInitRule(),
TopDownInitRule(),
CachedTopDownPredictRule(),
CompleteFundamentalRule(),
]
BU_INCREMENTAL_STRATEGY = [
LeafInitRule(),
EmptyPredictRule(),
BottomUpPredictRule(),
CompleteFundamentalRule(),
]
BU_LC_INCREMENTAL_STRATEGY = [
LeafInitRule(),
EmptyPredictRule(),
BottomUpPredictCombineRule(),
CompleteFundamentalRule(),
]
LC_INCREMENTAL_STRATEGY = [
LeafInitRule(),
FilteredBottomUpPredictCombineRule(),
FilteredCompleteFundamentalRule(),
]
class IncrementalChartParser(ChartParser):
"""
An *incremental* chart parser implementing Jay Earley's
parsing algorithm:
| For each index end in [0, 1, ..., N]:
| For each edge such that edge.end = end:
| If edge is incomplete and edge.next is not a part of speech:
| Apply PredictorRule to edge
| If edge is incomplete and edge.next is a part of speech:
| Apply ScannerRule to edge
| If edge is complete:
| Apply CompleterRule to edge
| Return any complete parses in the chart
"""
def __init__(
self,
grammar,
strategy=BU_LC_INCREMENTAL_STRATEGY,
trace=0,
trace_chart_width=50,
chart_class=IncrementalChart,
):
"""
Create a new Earley chart parser, that uses ``grammar`` to
parse texts.
:type grammar: CFG
:param grammar: The grammar used to parse texts.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
:type trace_chart_width: int
:param trace_chart_width: The default total width reserved for
the chart in trace output. The remainder of each line will
be used to display edges.
:param chart_class: The class that should be used to create
the charts used by this parser.
"""
self._grammar = grammar
self._trace = trace
self._trace_chart_width = trace_chart_width
self._chart_class = chart_class
self._axioms = []
self._inference_rules = []
for rule in strategy:
if rule.NUM_EDGES == 0:
self._axioms.append(rule)
elif rule.NUM_EDGES == 1:
self._inference_rules.append(rule)
else:
raise ValueError(
"Incremental inference rules must have " "NUM_EDGES == 0 or 1"
)
def chart_parse(self, tokens, trace=None):
if trace is None:
trace = self._trace
trace_new_edges = self._trace_new_edges
tokens = list(tokens)
self._grammar.check_coverage(tokens)
chart = self._chart_class(tokens)
grammar = self._grammar
# Width, for printing trace edges.
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
if trace:
print(chart.pretty_format_leaves(trace_edge_width))
for axiom in self._axioms:
new_edges = list(axiom.apply(chart, grammar))
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
inference_rules = self._inference_rules
for end in range(chart.num_leaves() + 1):
if trace > 1:
print("\n* Processing queue:", end, "\n")
agenda = list(chart.select(end=end))
while agenda:
edge = agenda.pop()
for rule in inference_rules:
new_edges = list(rule.apply(chart, grammar, edge))
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
for new_edge in new_edges:
if new_edge.end() == end:
agenda.append(new_edge)
return chart
class EarleyChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
class IncrementalTopDownChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalBottomUpChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
if not grammar.is_nonempty():
raise ValueError(
"IncrementalLeftCornerParser only works for grammars "
"without empty productions."
)
IncrementalChartParser.__init__(
self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
)
# ////////////////////////////////////////////////////////////
# Incremental FCFG Chart Parsers
# ////////////////////////////////////////////////////////////
EARLEY_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureTopDownInitRule(),
FeatureCompleterRule(),
FeatureScannerRule(),
FeaturePredictorRule(),
]
TD_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureTopDownInitRule(),
FeatureTopDownPredictRule(),
FeatureCompleteFundamentalRule(),
]
BU_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictRule(),
FeatureCompleteFundamentalRule(),
]
BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictCombineRule(),
FeatureCompleteFundamentalRule(),
]
class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
def __init__(
self,
grammar,
strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
trace_chart_width=20,
chart_class=FeatureIncrementalChart,
**parser_args
):
IncrementalChartParser.__init__(
self,
grammar,
strategy=strategy,
trace_chart_width=trace_chart_width,
chart_class=chart_class,
**parser_args
)
class FeatureEarleyChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
# ////////////////////////////////////////////////////////////
# Demonstration
# ////////////////////////////////////////////////////////////
def demo(
print_times=True,
print_grammar=False,
print_trees=True,
trace=2,
sent="I saw John with a dog with my cookie",
numparses=5,
):
"""
A demonstration of the Earley parsers.
"""
import sys, time
from nltk.parse.chart import demo_grammar
# The grammar for ChartParser and SteppingChartParser:
grammar = demo_grammar()
if print_grammar:
print("* Grammar")
print(grammar)
# Tokenize the sample sentence.
print("* Sentence:")
print(sent)
tokens = sent.split()
print(tokens)
print()
# Do the parsing.
earley = EarleyChartParser(grammar, trace=trace)
t = perf_counter()
chart = earley.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
t = perf_counter() - t
# Print results.
if numparses:
assert len(parses) == numparses, "Not all parses found"
if print_trees:
for tree in parses:
print(tree)
else:
print("Nr trees:", len(parses))
if print_times:
print("Time:", t)
if __name__ == "__main__":
demo()