You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1489 lines
54 KiB
Python
1489 lines
54 KiB
Python
# Natural Language Toolkit: Regular Expression Chunkers
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
from __future__ import print_function, unicode_literals
|
|
from __future__ import division
|
|
|
|
import re
|
|
|
|
from six import string_types
|
|
|
|
from nltk.tree import Tree
|
|
from nltk.chunk.api import ChunkParserI
|
|
from nltk.compat import python_2_unicode_compatible, unicode_repr
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## ChunkString
|
|
##//////////////////////////////////////////////////////
|
|
|
|
|
|
@python_2_unicode_compatible
class ChunkString(object):
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text.  This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces.  An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` are created from tagged texts (i.e., lists of
    ``tokens`` whose type is ``TaggedType``).  Initially, nothing is
    chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation.  These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding.  This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces.  An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _root_label: The label of the chunk structure this
        ``ChunkString`` was built from; reused as the root label by
        ``to_chunkstruct()``.
    :ivar _debug: The debug level.  See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chinks.
    """

    CHUNK_TAG_CHAR = r'[^\{\}<>]'
    CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR

    IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
    IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'

    # These are used by _verify
    _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
    _CHINK = r'(%s+?)+?' % CHUNK_TAG
    _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
    # Raw string: '\{' is not a valid escape in an ordinary string literal.
    _BRACKETS = re.compile(r'[^\{\}]+')
    _BALANCED_BRACKETS = re.compile(r'(\{\})*$')

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text in ``chunk_struct``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``.  The
            valid levels are:

            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
              each transformation.
            - 3: full check on to_chunkstruct and full check after
              each transformation.

            We recommend you use at least level 1.  You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        """
        self._root_label = chunk_struct.label()
        self._pieces = chunk_struct[:]
        tags = [self._tag(tok) for tok in self._pieces]
        self._str = '<' + '><'.join(tags) + '>'
        self._debug = debug_level

    def _tag(self, tok):
        """
        Return the tag used to represent ``tok`` in the string encoding:
        the second element of a tagged-token tuple, or the label of a
        ``Tree``.

        :raise ValueError: if ``tok`` is neither a tuple nor a ``Tree``.
        """
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError('chunk structures must contain tagged tokens or trees')

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked.  If this is false, ``_verify`` will check to make
            sure that ``s`` encodes a chunked version of *some*
            list of tokens.  If this is true, then ``_verify`` will
            check to make sure that the tags in ``s`` match those in
            ``_pieces``.

        :raise ValueError: if the internal string representation of
            this ``ChunkString`` is invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError('Transformation generated invalid chunkstring:\n %s' % s)

        # Check that parens are balanced.  If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.
        brackets = ChunkString._BRACKETS.sub('', s)
        for i in range(1 + len(brackets) // 5000):
            substr = brackets[i * 5000 : i * 5000 + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    'Transformation generated invalid chunkstring:\n %s' % s
                )

        if verify_tags <= 0:
            return

        # Splitting on runs of delimiters leaves an empty string at each
        # end (the encoding starts and ends with a delimiter); drop them.
        tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError('Transformation generated invalid chunkstring: tag changed')

    def to_chunkstruct(self, chunk_label='CHUNK'):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :type chunk_label: str
        :param chunk_label: The node label given to each chunk subtree.
        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Splitting on braces yields an alternating list of segments:
        # chink, chunk, chink, ...  (the first segment is always outside
        # any chunk, and may be empty).
        pieces = []
        index = 0
        piece_in_chunk = False
        for piece in re.split('[{}]', self._str):

            # Find the list of tokens contained in this piece.
            length = piece.count('<')
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``.  In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags.  Furthermore, this transformation may not result in
        improper bracketing.  Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced.  This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring.  Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}").  Remove them, so they don't
        # interfere with other transformations.
        s = s.replace('{}', '')

        # Make sure that the transformation was legal.  Level 2 gives a
        # cursory check (verify_tags == 0); level 3 also checks the tags.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return '<ChunkString: %s>' % unicode_repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        aligned = re.sub(r'>(?!\})', r'> ', self._str)
        aligned = re.sub(r'([^\{])<', r'\1 <', aligned)
        # startswith() rather than indexing, so an empty encoding is safe.
        if aligned.startswith('<'):
            aligned = ' ' + aligned
        return aligned
|
|
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## Chunking Rules
|
|
##//////////////////////////////////////////////////////
|
|
|
|
|
|
@python_2_unicode_compatible
class RegexpChunkRule(object):
    """
    A transformational rule that edits the chunking of a
    ``ChunkString`` by regular-expression search and replace.

    A ``RegexpChunkRule`` pairs a regular expression with a replacement
    expression.  "Applying" the rule to a ``ChunkString`` replaces every
    substring of its encoding that matches the regular expression,
    with the same semantics as ``re.sub``.  This class can express any
    regexp-based transformation directly; its subclasses provide
    simpler rule types that are driven by tag patterns.

    Every rule also carries a short description string (typically less
    than 75 characters) stating the purpose of the rule.

    The transformation defined by a rule should only add and remove
    braces; it should *not* modify the sequence of angle-bracket
    delimited tags.  Furthermore, the transformation may not result in
    nested or mismatched bracketing.
    """

    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        :type regexp: regexp or str
        :param regexp: The regular expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any
            substring that matches ``regexp`` will be replaced using
            the replacement string ``repl``.  Note that this must be a
            normal regular expression, not a tag pattern.  A string is
            compiled; a compiled pattern is stored as given.
        :type repl: str
        :param repl: The replacement expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any substring
            that matches ``regexp`` will be replaced using ``repl``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        compiled = re.compile(regexp) if isinstance(regexp, string_types) else regexp
        self._repl = repl
        self._descr = descr
        self._regexp = compiled

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given ``ChunkString``.  See the
        class reference documentation for a description of what it
        means to apply a rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunkstring to which this rule is applied.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        Return a short description of the purpose and/or effect of
        this rule.

        :rtype: str
        """
        return self._descr

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        pattern_repr = unicode_repr(self._regexp.pattern)
        repl_repr = unicode_repr(self._repl)
        return '<RegexpChunkRule: ' + pattern_repr + '->' + repl_repr + '>'

    @staticmethod
    def fromstring(s):
        """
        Create a RegexpChunkRule from a string description.
        Currently, the following formats are supported::

          {regexp}         # chunk rule
          }regexp{         # chink rule
          regexp}{regexp   # split rule
          regexp{}regexp   # merge rule

        Where ``regexp`` is a regular expression for the rule.  Any
        text following the comment marker (``#``) will be used as
        the rule's description:

        >>> from nltk.chunk.regexp import RegexpChunkRule
        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
        <ChunkRule: '<DT>?<NN.*>+'>

        :raise ValueError: If ``s`` is empty or does not fit any of the
            supported formats.
        """
        # Split off the comment (but don't split on '\#')
        parsed = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
        rule = parsed.group('rule').strip()
        comment = (parsed.group('comment') or '')[1:].strip()

        # Pattern bodies: chunk, chink, split, merge
        try:
            if not rule:
                raise ValueError('Empty chunk pattern')

            first, last = rule[0], rule[-1]
            if first == '{' and last == '}':
                return ChunkRule(rule[1:-1], comment)
            if first == '}' and last == '{':
                return ChinkRule(rule[1:-1], comment)
            if '}{' in rule:
                # More than one separator fails the unpacking below and
                # is reported as an illegal pattern.
                left, right = rule.split('}{')
                return SplitRule(left, right, comment)
            if '{}' in rule:
                left, right = rule.split('{}')
                return MergeRule(left, right, comment)
            if re.match('[^{}]*{[^{}]*}[^{}]*', rule):
                left, chunk, right = re.split('[{}]', rule)
                return ChunkRuleWithContext(left, chunk, right, comment)
            raise ValueError('Illegal chunk pattern: %s' % rule)
        except (ValueError, re.error):
            raise ValueError('Illegal chunk pattern: %s' % rule)
|
|
|
|
|
|
@python_2_unicode_compatible
class ChunkRule(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using a
    matching tag pattern.  When applied to a ``ChunkString``, it will
    find any substring that matches this tag pattern and that is not
    already part of a chunk, and create a new chunk containing that
    substring.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            chunk any substring that matches this tag pattern and that
            is not already part of a chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing zero-width IN_CHINK_PATTERN restricts matches to
        # positions that are outside any existing chunk.
        regexp = re.compile(
            '(?P<chunk>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
        )
        # Raw string: '\g' is not a valid escape in an ordinary literal.
        RegexpChunkRule.__init__(self, regexp, r'{\g<chunk>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChunkRule: ' + unicode_repr(self._pattern) + '>'
|
|
|
|
|
|
@python_2_unicode_compatible
class ChinkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chinks from the chunks of a
    ``ChunkString``, using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any substring that matches this
    tag pattern and that is contained in a chunk, and remove it
    from that chunk, thus creating two new chunks.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChinkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any substring that matches this tag pattern and that
            is contained in a chunk, and remove it from that chunk,
            thus creating two new chunks.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing zero-width IN_CHUNK_PATTERN restricts matches to
        # positions that are inside an existing chunk.
        regexp = re.compile(
            '(?P<chink>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
        )
        # Raw string: '\g' is not a valid escape in an ordinary literal.
        RegexpChunkRule.__init__(self, regexp, r'}\g<chink>{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChinkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChinkRule: ' + unicode_repr(self._pattern) + '>'
|
|
|
|
|
|
@python_2_unicode_compatible
class UnChunkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chunks from a ``ChunkString``,
    using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any complete chunk that matches this
    tag pattern, and un-chunk it.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``UnChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any complete chunk that matches this tag pattern,
            and un-chunk it.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Raw strings: '\{', '\}' and '\g' are not valid escapes in
        # ordinary string literals.
        regexp = re.compile(r'\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern))
        RegexpChunkRule.__init__(self, regexp, r'\g<chunk>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <UnChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>'
|
|
|
|
|
|
@python_2_unicode_compatible
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose end
    matches left pattern, and immediately followed by a chunk whose
    beginning matches right pattern.  It will then merge those two
    chunks into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chunk
            whose beginning matches this pattern.  It will
            then merge those two chunks into a single chunk.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chunk
            whose beginning matches ``right_tag_pattern``.  It will
            then merge those two chunks into a single chunk.

        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and reused below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right pattern is a lookahead: only the "}{" boundary
        # after the left match is consumed and dropped.
        regexp = re.compile('(?P<left>%s)}{(?=%s)' % (left_re, right_re))
        # Raw string: '\g' is not a valid escape in an ordinary literal.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<MergeRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
|
|
|
|
|
|
@python_2_unicode_compatible
class SplitRule(RegexpChunkRule):
    """
    A rule that splits chunks in a ``ChunkString``, driven by two tag
    patterns: a left pattern and a right pattern.  When applied to a
    ``ChunkString``, it finds any chunk that matches the left pattern
    followed by the right pattern, and splits that chunk into two new
    chunks at the point between the two pattern matches.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``SplitRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this rule will
            find any chunk containing a substring that matches
            ``left_tag_pattern`` followed by this pattern.  It will
            then split the chunk into two new chunks at the point
            between these two matching patterns.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this rule will
            find any chunk containing a substring that matches this
            pattern followed by ``right_tag_pattern``.  It will then
            split the chunk into two new chunks at the point between
            these two matching patterns.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each side on its own first, so that an incoherent
        # pair (e.g. left='(' and right=')') raises an exception
        # instead of combining into a valid-looking regexp.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern

        # The right side is only a lookahead; "}{" is inserted after
        # the text matched by the left side.
        combined = re.compile('(?P<left>%s)(?=%s)' % (left_re, right_re))
        RegexpChunkRule.__init__(self, combined, r'\g<left>}{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <SplitRule: '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        left_repr = unicode_repr(self._left_tag_pattern)
        right_repr = unicode_repr(self._right_tag_pattern)
        return '<SplitRule: ' + left_repr + ', ' + right_repr + '>'
|
|
|
|
|
|
@python_2_unicode_compatible
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
    using two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose beginning
    matches right pattern, and immediately preceded by a chink whose
    end matches left pattern.  It will then expand the chunk to incorporate
    the new material on the left.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandLeftRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            this pattern, and immediately preceded by a chink
            whose end matches ``left_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the left.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            ``right_tag_pattern``, and immediately preceded by a chink
            whose end matches this pattern.  It will
            then expand the chunk to incorporate the new material on the left.

        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and reused below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: '\{' and '\g' are not valid escapes in ordinary
        # string literals.  The replacement moves the opening brace
        # from after the left match to before it.
        regexp = re.compile(r'(?P<left>%s)\{(?P<right>%s)' % (left_re, right_re))
        RegexpChunkRule.__init__(self, regexp, r'{\g<left>\g<right>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandLeftRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
|
|
|
|
|
|
@python_2_unicode_compatible
class ExpandRightRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the
    right, using two matching tag patterns: a left pattern, and a
    right pattern.  When applied to a ``ChunkString``, it will find any
    chunk whose end matches left pattern, and immediately followed by
    a chink whose beginning matches right pattern.  It will then
    expand the chunk to incorporate the new material on the right.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandRightRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chink
            whose beginning matches this pattern.  It will
            then expand the chunk to incorporate the new material on the right.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chink
            whose beginning matches ``right_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the right.

        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and reused below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: '\}' and '\g' are not valid escapes in ordinary
        # string literals.  The replacement moves the closing brace
        # from before the right match to after it.
        regexp = re.compile(r'(?P<left>%s)\}(?P<right>%s)' % (left_re, right_re))
        RegexpChunkRule.__init__(self, regexp, r'\g<left>\g<right>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandRightRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
|
|
|
|
|
|
@python_2_unicode_compatible
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using
    three matching tag patterns: one for the left context, one for the
    chunk, and one for the right context.  When applied to a
    ``ChunkString``, it will find any substring that matches the chunk
    tag pattern, is surrounded by substrings that match the two
    context patterns, and is not already part of a chunk; and create a
    new chunk containing the substring that matched the chunk tag
    pattern.

    Caveat: Both the left and right context are consumed when this
    rule matches; therefore, if you need to find overlapping matches,
    you will need to apply your rule more than once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: A tag pattern that must match
            the left context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: A tag pattern that must match for this
            rule to apply.  If the rule does apply, then this pattern
            also identifies the substring that will be made into a chunk.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: A tag pattern that must match
            the right context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and reused below.
        left_re = tag_pattern2re_pattern(left_context_tag_pattern)
        re.compile(left_re)
        chunk_re = tag_pattern2re_pattern(chunk_tag_pattern)
        re.compile(chunk_re)
        right_re = tag_pattern2re_pattern(right_context_tag_pattern)
        re.compile(right_re)

        self._left_context_tag_pattern = left_context_tag_pattern
        self._chunk_tag_pattern = chunk_tag_pattern
        self._right_context_tag_pattern = right_context_tag_pattern
        # All three parts are consumed by the match; the trailing
        # zero-width IN_CHINK_PATTERN is checked at the end of the
        # right context.
        regexp = re.compile(
            '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s'
            % (left_re, chunk_re, right_re, ChunkString.IN_CHINK_PATTERN)
        )
        replacement = r'\g<left>{\g<chunk>}\g<right>'
        RegexpChunkRule.__init__(self, regexp, replacement, descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        # unicode_repr (not %r) for consistency with the other rule
        # classes in this module.
        return '<ChunkRuleWithContext: %s, %s, %s>' % (
            unicode_repr(self._left_context_tag_pattern),
            unicode_repr(self._chunk_tag_pattern),
            unicode_repr(self._right_context_tag_pattern),
        )
|
|
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## Tag Pattern Format Conversion
|
|
##//////////////////////////////////////////////////////
|
|
|
|
# Validates a tag pattern after '<' and '>' have been expanded to
# '(<(' and ')>)'.  A valid pattern is any sequence of:
#   - runs of characters other than braces and angle brackets, mixed
#     with numeric repetition ranges such as {2}, {2,}, {,3} or {2,3};
#   - angle-bracket delimited tags containing no braces or brackets.
# This should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
# Raw strings throughout: '\{', '\}' and '\d' are not valid escapes in
# ordinary string literals.
CHUNK_TAG_PATTERN = re.compile(
    r'^((%s|<%s>)*)$' % (r'([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', r'[^\{\}<>]+')
)
|
|
|
|
|
|
def tag_pattern2re_pattern(tag_pattern):
    """
    Convert a tag pattern to a regular expression pattern.  A "tag
    pattern" is a modified version of a regular expression, designed
    for matching sequences of tags.  The differences between regular
    expression patterns and tag patterns are:

        - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
          ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
          ``'<NN'`` followed by one or more repetitions of ``'>'``.
        - Whitespace in tag patterns is ignored.  So
          ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
        - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
          ``'<NN.*>'`` matches any single tag starting with ``'NN'``.

    In particular, ``tag_pattern2re_pattern`` performs the following
    transformations on the given pattern:

        - Remove any whitespace
        - Add extra parens around '<' and '>', to make '<' and '>' act
          like parentheses.  E.g., so that in '<NN>+', the '+' has scope
          over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
          scope over 'NN' and 'IN', but not '<' or '>'.
        - Check to make sure the resulting pattern is valid.
        - Replace every unescaped '.' with
          ``ChunkString.CHUNK_TAG_CHAR``.

    :type tag_pattern: str
    :param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern.
        In particular, ``tag_pattern`` should not include braces; and it
        should not contain nested or mismatched angle-brackets.
    :rtype: str
    :return: A regular expression pattern corresponding to
        ``tag_pattern``.
    """
    # Clean up the regular expression: drop whitespace, then wrap each
    # angle bracket in parentheses so quantifiers apply to whole tags.
    tag_pattern = re.sub(r'\s', '', tag_pattern)
    tag_pattern = tag_pattern.replace('<', '(<(')
    tag_pattern = tag_pattern.replace('>', ')>)')

    # Check the regular expression
    if not CHUNK_TAG_PATTERN.match(tag_pattern):
        raise ValueError('Bad tag pattern: %r' % tag_pattern)

    # Replace "." with CHUNK_TAG_CHAR.
    # This has to happen after validation, since the replacement adds
    # {}[]<>s, which would confuse CHUNK_TAG_PATTERN.
    #
    # A '.' is only replaced when it is not escaped, i.e. not preceded
    # by an odd number of backslashes.  That needs a variable-width
    # look-behind, which ``re`` does not support, so the pattern is
    # reversed, matched with a look-ahead instead, and reversed back.
    reversed_tag_char = ChunkString.CHUNK_TAG_CHAR[::-1]
    reversed_pattern = re.sub(
        r'\.(?!\\(\\\\)*($|[^\\]))', reversed_tag_char, tag_pattern[::-1]
    )
    return reversed_pattern[::-1]
|
|
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## RegexpChunkParser
|
|
##//////////////////////////////////////////////////////
|
|
|
|
|
|
@python_2_unicode_compatible
class RegexpChunkParser(ChunkParserI):
    """
    A chunk parser driven by regular expressions.  A
    ``RegexpChunkParser`` holds a sequence of "rules" and uses them to
    find chunks of a single type within a text.  The chunking is
    encoded as a ``ChunkString``, and every rule works by modifying
    the chunking held in that ``ChunkString``.  All rules are
    implemented with regular expression matching and substitution.

    The rules themselves are ``RegexpChunkRule`` instances or
    instances of its subclasses (``ChunkRule``, ``ChinkRule``,
    ``UnChunkRule``, ``MergeRule``, and ``SplitRule``).  Each rule
    provides an ``apply()`` method, which modifies the chunking
    encoded by a given ``ChunkString``.

    :type _rules: list(RegexpChunkRule)
    :ivar _rules: The list of rules that should be applied to a text.
    :type _trace: int
    :ivar _trace: The default level of tracing.
    """

    def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
        """
        Construct a new ``RegexpChunkParser``.

        :type rules: list(RegexpChunkRule)
        :param rules: The sequence of rules that should be used to
            generate the chunking for a tagged text.
        :type chunk_label: str
        :param chunk_label: The node value that should be used for
            chunk subtrees.  This is typically a short string
            describing the type of information contained by the chunk,
            such as ``"NP"`` for base noun phrases.
        :type root_label: str
        :param root_label: The node value that should be used for the
            top node of the chunk structure.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        """
        self._rules = rules
        self._trace = trace
        self._chunk_label = chunk_label
        self._root_label = root_label

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``,
        in turn, printing the chunk string before the first rule and
        again after every rule.  With ``verbose`` set, each trace
        header also carries the rule's representation.

        :type chunkstr: ChunkString
        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type verbose: bool
        :param verbose: Whether output should be verbose.
        :rtype: None
        """
        print('# Input:')
        print(chunkstr)
        for current_rule in self._rules:
            current_rule.apply(chunkstr)
            heading = current_rule.descr()
            if verbose:
                heading += ' (' + unicode_repr(current_rule) + ')'
            print('#', heading + ':')
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``,
        in turn, without producing any trace output.

        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type chunkstr: ChunkString
        :rtype: None
        """
        for current_rule in self._rules:
            current_rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :rtype: Tree
        :return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  The set of chunks
            identified in the chunk structure depends on the rules
            used to define this ``RegexpChunkParser``.
        """
        # Nothing to chunk: warn and hand back an empty root node.
        if len(chunk_struct) == 0:
            print('Warning: parsing empty text')
            return Tree(self._root_label, [])

        # Input without a ``label()`` method (e.g. a plain list of
        # tagged tokens) is wrapped in a tree under the root label.
        try:
            chunk_struct.label()
        except AttributeError:
            chunk_struct = Tree(self._root_label, chunk_struct)

        # Fall back on the constructor's trace level when none is given.
        level = self._trace if trace is None else trace

        chunkstr = ChunkString(chunk_struct)

        # Apply the sequence of rules to the chunkstring.
        if not level:
            self._notrace_apply(chunkstr)
        else:
            self._trace_apply(chunkstr, level > 1)

        # Use the chunkstring to create a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_label)

    def rules(self):
        """
        :return: the sequence of rules used by ``RegexpChunkParser``.
        :rtype: list(RegexpChunkRule)
        """
        return self._rules

    def __repr__(self):
        """
        :return: a concise string representation of this
            ``RegexpChunkParser``.
        :rtype: str
        """
        return "<RegexpChunkParser with %d rules>" % len(self._rules)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpChunkParser``: a header line followed by each
            rule's description and representation.
        :rtype: str
        """
        header = "RegexpChunkParser with %d rules:\n" % len(self._rules)

        # Width of the longest description decides the layout.
        widest = max([0] + [len(rule.descr()) for rule in self._rules])
        if widest >= 35:
            # Long descriptions: description and rule on separate lines.
            template = " %s\n %s\n"
        else:
            # Short descriptions: left-justified column, rule alongside.
            template = " %" + repr(-(widest + 3)) + "s%s\n"

        body = ''.join(
            template % (rule.descr(), unicode_repr(rule)) for rule in self._rules
        )
        # Drop the trailing newline.
        return (header + body)[:-1]
|
|
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## Chunk Grammar
|
|
##//////////////////////////////////////////////////////
|
|
|
|
|
|
@python_2_unicode_compatible
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

        NP:
          {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _start: str
    :ivar _start: The start symbol of the grammar (the root node of
        resulting trees)
    :type _stages: list(RegexpChunkParser)
    :ivar _stages: The list of parsing stages corresponding to the grammar

    """

    def __init__(self, grammar, root_label='S', loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :raise TypeError: If ``grammar`` is neither a string nor an
            iterable consisting only of ``RegexpChunkParser`` objects.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, string_types):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = 'Expected string or list of RegexpChunkParsers for the grammar.'
            try:
                grammar = list(grammar)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not turned into a
                # TypeError.
                raise TypeError(type_err)
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.

        Each line is stripped.  A line containing a ':' closes the
        current stage and opens a new one named by the text before the
        ':'; the text after it is treated as the first rule line of the
        new stage.  Blank lines and lines starting with '#' are skipped;
        every other line is turned into a rule with
        ``RegexpChunkRule.fromstring``.

        :param grammar: The grammar text, one clause header or rule per line.
        :type grammar: str
        :param root_label: Root label handed to every stage.
        :param trace: Trace level handed to every stage.
        :type trace: int
        """
        # New stage begins if there's an unescaped ':'
        # NOTE(review): in this non-raw literal '\\.' becomes the regex
        # '\.' (a literal dot), so the group matches any run of
        # non-colon characters and a backslash-escaped ':' is not
        # treated specially.  Left unchanged to preserve behavior.
        stage_marker = re.compile('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))')

        rules = []
        lhs = None
        for line in grammar.split('\n'):
            line = line.strip()

            m = stage_marker.match(line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group('nonterminal').strip()
                rules = []
                line = m.group('rule').strip()

            # Skip blank & comment-only lines
            if line == '' or line.startswith('#'):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.
        Nothing is added when ``rules`` is empty.

        :param rules: The rules collected for the stage.
        :type rules: list(RegexpChunkRule)
        :param lhs: The stage name, used as the chunk label.
        :type lhs: str or None
        :param root_label: Root label for the stage's parser.
        :param trace: Trace level for the stage's parser.
        :type trace: int
        :raise ValueError: If there are rules but no stage name.
        """
        if rules:
            if not lhs:
                raise ValueError('Expected stage marker (eg NP:)')
            parser = RegexpChunkParser(
                rules, chunk_label=lhs, root_label=root_label, trace=trace
            )
            self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.  Every stage is run in
        order, and the whole sequence is repeated ``loop`` times; each
        stage receives the output of the previous one.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output.
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        for _ in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``: a header line followed by the verbose
            representation of every stage.
        :rtype: str
        """
        lines = ["chunk.RegexpParser with %d stages:" % len(self._stages)]
        lines.extend("%s" % parser for parser in self._stages)
        return "\n".join(lines)
|
|
|
|
|
|
##//////////////////////////////////////////////////////
|
|
## Demonstration code
|
|
##//////////////////////////////////////////////////////
|
|
|
|
|
|
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser with a
    ``ChunkScore``.  ``text`` is assumed to hold one sentence per
    line, each in the form expected by ``tree.chunk``.  Every
    non-blank sentence is parsed with the given chunk parser (with
    tracing on) and scored against its gold chunking.  Afterwards the
    final score (precision, recall, and f-measure) is printed,
    followed by the chunks that were missed and the chunks that were
    incorrect.  (At most 10 missing chunks and 10 incorrect chunks
    are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    def report(title, fetch):
        # Print up to ten chunks returned by ``fetch``, under ``title``;
        # an ellipsis marks that more than ten were found.
        if fetch():
            print(title)
            found = fetch()
            for item in found[:10]:
                print(' ', ' '.join(map(str, item)))
            if len(fetch()) > 10:
                print(' ...')

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for raw_line in text.split('\n'):
        print(raw_line)
        stripped = raw_line.strip()
        if stripped:
            gold = chunk.tagstr2tree(stripped)
            guess = chunkparser.parse(Tree('S', gold.leaves()), trace=1)
            chunkscore.score(gold, guess)
            print()

    bar = '=' * 75
    print('/' + bar + '\\')
    print('Scoring', chunkparser)
    print('-' * 77)
    print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
    print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
    print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))

    # Missed chunks.
    report('Missed:', chunkscore.missed)

    # Incorrect chunks.
    report('Incorrect:', chunkscore.incorrect)

    print('\\' + bar + '/')
    print()
|
|
|
|
|
|
def demo():
    """
    A demonstration for the ``RegexpChunkParser`` class.  A single text
    is parsed with four different chunk parsers, using a variety of
    rules and strategies.  After that, accuracy is printed for an empty
    grammar and for a small NP grammar against the CoNLL-2000 test
    data, and finally a cascaded grammar is applied to a plain list of
    tagged tokens.
    """

    from nltk import chunk, Tree

    # Kept flush-left: these lines are printed verbatim.
    text = """\
[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
[ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
[ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
"""

    banner = '*' * 75
    print(banner)
    print('Evaluation text:')
    print(text)
    print(banner)
    print()

    # Grammar lines are stripped when the grammar is read, so the
    # indentation inside these literals is purely cosmetic.
    chunk_only_grammar = r"""
    NP: # NP stage
    {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
    {<NNP>+} # chunk proper nouns
    """

    chink_merge_grammar = r"""
    NP:
    {<.*>} # start by chunking each tag
    }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
    <DT|JJ>{}<NN.*> # merge det/adj with nouns
    """

    np_vp_grammar = r"""
    NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>} # VP = verb words
    """

    cascaded_grammar = r"""
    NP: {<.*>*} # start by chunking everything
    }<[\.VI].*>+{ # chink any verbs, prepositions or periods
    <.*>}{<DT> # separate on determiners
    PP: {<IN><NP>} # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
    """

    # Score the evaluation text with each of the four parsers in turn.
    for grammar in (
        chunk_only_grammar,
        chink_merge_grammar,
        np_vp_grammar,
        cascaded_grammar,
    ):
        cp = chunk.RegexpParser(grammar)
        demo_eval(cp, text)

    # Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    cp = chunk.RegexpParser(chink_merge_grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")

    cp = chunk.RegexpParser(cascaded_grammar)
    tagged_tokens = [
        ("the", "DT"),
        ("little", "JJ"),
        ("cat", "NN"),
        ("sat", "VBD"),
        ("on", "IN"),
        ("the", "DT"),
        ("mat", "NN"),
        (".", "."),
    ]
    print(cp.parse(tagged_tokens))
|
|
|
|
|
|
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()
|