You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
87 lines
2.1 KiB
Python
87 lines
2.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
import unittest
|
|
|
|
from nltk import RegexpParser
|
|
|
|
|
|
class TestChunkRule(unittest.TestCase):
    """Regression tests for tag patterns used in ``RegexpParser`` chunk rules."""

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.

        The sentence is parsed with a grammar whose tag pattern carries a
        ``{4,}`` quantifier, and the pretty-printed tree is compared with the
        expected output, in which exactly two four-token runs are chunked.
        """
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]  # source: brown corpus

        # The inner ``{4,}`` is the quantifier under test; the outer braces
        # delimit the chunk rule itself.
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)

        # One entry per child of the root ``S`` node, in sentence order.  Only
        # the two runs of four N-tagged tokens are expected to be chunked; the
        # three-token run "September-October term jury" must stay flat.
        expected_children = [
            'The/AT',
            'September-October/NP',
            'term/NN',
            'jury/NN',
            'had/HVD',
            'been/BEN',
            'charged/VBN',
            'by/IN',
            'Fulton/NP-TL',
            'Superior/JJ-TL',
            '(CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)',
            'to/TO',
            'investigate/VB',
            'reports/NNS',
            'of/IN',
            'possible/JJ',
            '``/``',
            'irregularities/NNS',
            "''/''",
            'in/IN',
            'the/AT',
            'hard-fought/JJ',
            'primary/NN',
            'which/WDT',
            'was/BEDZ',
            'won/VBN',
            'by/IN',
            '(CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)',
            './.',
        ]
        # When the tree does not fit on one line, pformat() places each child
        # on its own line indented by two spaces; the indent is spelled out
        # here so the expectation does not depend on source-file whitespace.
        expected = '(S\n  ' + '\n  '.join(expected_children) + ')'

        # assertEqual (rather than a bare ``assert``) still runs under
        # ``python -O`` and reports a line diff when the strings differ.
        self.assertEqual(tree.pformat(), expected)
|