You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

374 lines
11 KiB
Plaintext

.. Copyright (C) 2001-2020 NLTK Project
.. For license information, see LICENSE.TXT
==========
Chunking
==========
>>> from nltk.chunk import *
>>> from nltk.chunk.util import *
>>> from nltk.chunk.regexp import *
>>> from nltk import Tree
>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
>>> gold_chunked_text = tagstr2tree(tagged_text)
>>> unchunked_text = gold_chunked_text.flatten()
Chunking rules are written in a special regexp-like syntax, called tag
patterns, that delimits the chunks. A tag pattern must be converted to a
'regular' regular expression before a sentence can be chunked.
>>> tag_pattern = "<DT>?<JJ>*<NN.*>"
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
>>> regexp_pattern
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'
Construct some new chunking rules.
>>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
>>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions")
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
... "Split successive determiner/noun pairs")
Create and score a series of chunk parsers, successively more complex.
>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S
(NP
The/DT
cat/NN
sat/VBD
on/IN
the/DT
mat/NN
the/DT
dog/NN
chewed/VBD
./.))
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> print(chunkscore.precision())
0.0
>>> print(chunkscore.recall())
0.0
>>> print(chunkscore.f_measure())
0
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP The/DT cat/NN)
(NP the/DT dog/NN)
(NP the/DT mat/NN)
>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP
The/DT
cat/NN
sat/VBD
on/IN
the/DT
mat/NN
the/DT
dog/NN
chewed/VBD
./.)
>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule],
... chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text)
>>> print(chunked_text)
(S
(NP The/DT cat/NN)
sat/VBD
on/IN
(NP the/DT mat/NN the/DT dog/NN)
chewed/VBD
./.)
>>> assert chunked_text == chunk_parser.parse(list(unchunked_text))
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
0.5
>>> print(chunkscore.recall())
0.33333333...
>>> print(chunkscore.f_measure())
0.4
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
(NP the/DT dog/NN)
(NP the/DT mat/NN)
>>> for chunk in chunkscore.incorrect(): print(chunk)
(NP the/DT mat/NN the/DT dog/NN)
>>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule],
... chunk_label='NP')
>>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
# Input:
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
# Chunk everything:
{<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>}
# Chink on verbs/prepositions:
{<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.>
# Split successive determiner/noun pairs:
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
>>> print(chunked_text)
(S
(NP The/DT cat/NN)
sat/VBD
on/IN
(NP the/DT mat/NN)
(NP the/DT dog/NN)
chewed/VBD
./.)
>>> chunkscore = ChunkScore()
>>> chunkscore.score(gold_chunked_text, chunked_text)
>>> chunkscore.precision()
1.0
>>> chunkscore.recall()
1.0
>>> chunkscore.f_measure()
1.0
>>> chunkscore.missed()
[]
>>> chunkscore.incorrect()
[]
>>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE
[<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>,
<SplitRule: '<DT><NN>', '<DT><NN>'>]
Printing parsers:
>>> print(repr(chunk_parser))
<RegexpChunkParser with 3 rules>
>>> print(chunk_parser)
RegexpChunkParser with 3 rules:
Chunk everything
<ChunkRule: '<.*>+'>
Chink on verbs/prepositions
<ChinkRule: '<VBD|IN|\\.>'>
Split successive determiner/noun pairs
<SplitRule: '<DT><NN>', '<DT><NN>'>
Regression Tests
~~~~~~~~~~~~~~~~
ChunkParserI
------------
`ChunkParserI` is an abstract interface -- it is not meant to be
instantiated directly.
>>> ChunkParserI().parse([])
Traceback (most recent call last):
. . .
NotImplementedError
ChunkString
-----------
ChunkString can be built from a tree of tagged tuples, a tree of
trees, or a tree that mixes both:
>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
>>> ChunkString(t1)
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
>>> ChunkString(t2)
<ChunkString: '<t0><t1>'>
>>> ChunkString(t3)
<ChunkString: '<t0><t1>'>
Other values raise a `ValueError`:
>>> ChunkString(Tree('S', ['x']))
Traceback (most recent call last):
. . .
ValueError: chunk structures must contain tagged tokens or trees
The `str()` for a chunk string adds spaces to it, which makes it line
up with `str()` output for other chunk strings over the same
underlying input.
>>> cs = ChunkString(t1)
>>> print(cs)
<t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9>
>>> cs.xform('<t3>', '{<t3>}')
>>> print(cs)
<t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9>
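The `_verify()` method makes sure that our transforms don't corrupt
the chunk string. When debug_level is 2 or higher, `_verify()` is
called at the end of every call to `xform`. The examples below use
debug_level=3, which additionally checks that the tags themselves
have not been changed.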
>>> cs = ChunkString(t1, debug_level=3)
>>> # tag not marked with <...>:
>>> cs.xform('<t3>', 't3')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring:
<t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>
>>> # brackets not balanced:
>>> cs.xform('<t3>', '{<t3>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring:
<t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>
>>> # nested brackets:
>>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring:
<t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>
>>> # modified tags:
>>> cs.xform('<t3>', '<t9>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: tag changed
>>> # added tags:
>>> cs.xform('<t9>', '<t9><t10>')
Traceback (most recent call last):
. . .
ValueError: Transformation generated invalid chunkstring: tag changed
Chunking Rules
--------------
Test the different rule constructors & __repr__ methods:
>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN,
... '{<a|b>}', 'chunk <a> and <b>')
>>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN),
... '{<a|b>}', 'chunk <a> and <b>')
>>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
>>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>')
>>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
>>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
>>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
>>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
>>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
>>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
... print(rule)
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
<ChunkRule: '<a|b>'>
<ChinkRule: '<a|b>'>
<UnChunkRule: '<a|b>'>
<MergeRule: '<a>', '<b>'>
<SplitRule: '<a>', '<b>'>
<ExpandLeftRule: '<a>', '<b>'>
<ExpandRightRule: '<a>', '<b>'>
`tag_pattern2re_pattern()` complains if the tag pattern looks problematic:
>>> tag_pattern2re_pattern('{}')
Traceback (most recent call last):
. . .
ValueError: Bad tag pattern: '{}'
RegexpChunkParser
-----------------
A warning is printed when parsing an empty sentence:
>>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
>>> parser.parse(Tree('S', []))
Warning: parsing empty text
Tree('S', [])
RegexpParser
------------
>>> parser = RegexpParser('''
... NP: {<DT>? <JJ>* <NN>*} # NP
... P: {<IN>} # Preposition
... V: {<V.*>} # Verb
... PP: {<P> <NP>} # PP -> P NP
... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
... ''')
>>> print(repr(parser))
<chunk.RegexpParser with 5 stages>
>>> print(parser)
chunk.RegexpParser with 5 stages:
RegexpChunkParser with 1 rules:
NP <ChunkRule: '<DT>? <JJ>* <NN>*'>
RegexpChunkParser with 1 rules:
Preposition <ChunkRule: '<IN>'>
RegexpChunkParser with 1 rules:
Verb <ChunkRule: '<V.*>'>
RegexpChunkParser with 1 rules:
PP -> P NP <ChunkRule: '<P> <NP>'>
RegexpChunkParser with 1 rules:
VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'>
>>> print(parser.parse(unchunked_text, trace=True))
# Input:
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
# NP:
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
# Input:
<NP> <VBD> <IN> <NP> <NP> <VBD> <.>
# Preposition:
<NP> <VBD> {<IN>} <NP> <NP> <VBD> <.>
# Input:
<NP> <VBD> <P> <NP> <NP> <VBD> <.>
# Verb:
<NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.>
# Input:
<NP> <V> <P> <NP> <NP> <V> <.>
# PP -> P NP:
<NP> <V> {<P> <NP>} <NP> <V> <.>
# Input:
<NP> <V> <PP> <NP> <V> <.>
# VP -> V (NP|PP)*:
<NP> {<V> <PP> <NP>}{<V>} <.>
(S
(NP The/DT cat/NN)
(VP
(V sat/VBD)
(PP (P on/IN) (NP the/DT mat/NN))
(NP the/DT dog/NN))
(VP (V chewed/VBD))
./.)
Test parsing of other rule types:
>>> print(RegexpParser('''
... X:
... }<a><b>{ # chink rule
... <a>}{<b> # split rule
... <a>{}<b> # merge rule
... <a>{<b>}<c> # chunk rule w/ context
... '''))
chunk.RegexpParser with 1 stages:
RegexpChunkParser with 4 rules:
chink rule <ChinkRule: '<a><b>'>
split rule <SplitRule: '<a>', '<b>'>
merge rule <MergeRule: '<a>', '<b>'>
chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'>
Illegal patterns raise a `ValueError`:
>>> print(RegexpParser('X: {<foo>} {<bar>}'))
Traceback (most recent call last):
. . .
ValueError: Illegal chunk pattern: {<foo>} {<bar>}