| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586 |
- # -*- coding: utf-8 -*-
- import unittest
- from nltk import RegexpParser
class TestChunkRule(unittest.TestCase):
    """Regression tests for tag patterns used in ``RegexpParser`` chunk rules."""

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
        """
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]  # source: brown corpus

        # The ``{4,}`` quantifier is the feature under test: only runs of at
        # least four consecutive ``<N.*>`` tags should be grouped into a CHUNK.
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)

        # Compared verbatim against Tree.pformat(): the literal must stay
        # flush-left except for the two-space indent pformat() gives each
        # child once the tree no longer fits on a single line.
        expected = """(S
  The/AT
  September-October/NP
  term/NN
  jury/NN
  had/HVD
  been/BEN
  charged/VBN
  by/IN
  Fulton/NP-TL
  Superior/JJ-TL
  (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
  to/TO
  investigate/VB
  reports/NNS
  of/IN
  possible/JJ
  ``/``
  irregularities/NNS
  ''/''
  in/IN
  the/AT
  hard-fought/JJ
  primary/NN
  which/WDT
  was/BEDZ
  won/VBN
  by/IN
  (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
  ./.)"""

        # assertEqual (not a bare ``assert``) still runs under ``python -O``
        # and reports a line-by-line diff of the two strings on failure.
        self.assertEqual(tree.pformat(), expected)
|