You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

87 lines
2.1 KiB
Python

# -*- coding: utf-8 -*-
import unittest
from nltk import RegexpParser
class TestChunkRule(unittest.TestCase):
    """Regression tests for chunk rules parsed by ``RegexpParser``."""

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
        """
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]  # source: brown corpus

        # The rule under test: a chunk is a run of four or more consecutive
        # tokens whose tag starts with "N", expressed with the `{4,}`
        # curly-bracket quantifier that issue #1597 reported as broken.
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)

        # When the tree does not fit on one line, pformat() puts each child
        # of the root on its own line indented by two spaces, so the expected
        # text must carry that indent for the comparison to succeed.
        expected = """(S
  The/AT
  September-October/NP
  term/NN
  jury/NN
  had/HVD
  been/BEN
  charged/VBN
  by/IN
  Fulton/NP-TL
  Superior/JJ-TL
  (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
  to/TO
  investigate/VB
  reports/NNS
  of/IN
  possible/JJ
  ``/``
  irregularities/NNS
  ''/''
  in/IN
  the/AT
  hard-fought/JJ
  primary/NN
  which/WDT
  was/BEDZ
  won/VBN
  by/IN
  (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
  ./.)"""

        # assertEqual (rather than a bare assert) reports a line-by-line diff
        # on failure and is not removed when Python runs with -O.
        self.assertEqual(tree.pformat(), expected)