test_chunk.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. # -*- coding: utf-8 -*-
  2. import unittest
  3. from nltk import RegexpParser
  4. class TestChunkRule(unittest.TestCase):
  5. def test_tag_pattern2re_pattern_quantifier(self):
  6. """Test for bug https://github.com/nltk/nltk/issues/1597
  7. Ensures that curly bracket quantifiers can be used inside a chunk rule.
  8. This type of quantifier has been used for the supplementary example
  9. in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
  10. """
  11. sent = [
  12. ('The', 'AT'),
  13. ('September-October', 'NP'),
  14. ('term', 'NN'),
  15. ('jury', 'NN'),
  16. ('had', 'HVD'),
  17. ('been', 'BEN'),
  18. ('charged', 'VBN'),
  19. ('by', 'IN'),
  20. ('Fulton', 'NP-TL'),
  21. ('Superior', 'JJ-TL'),
  22. ('Court', 'NN-TL'),
  23. ('Judge', 'NN-TL'),
  24. ('Durwood', 'NP'),
  25. ('Pye', 'NP'),
  26. ('to', 'TO'),
  27. ('investigate', 'VB'),
  28. ('reports', 'NNS'),
  29. ('of', 'IN'),
  30. ('possible', 'JJ'),
  31. ('``', '``'),
  32. ('irregularities', 'NNS'),
  33. ("''", "''"),
  34. ('in', 'IN'),
  35. ('the', 'AT'),
  36. ('hard-fought', 'JJ'),
  37. ('primary', 'NN'),
  38. ('which', 'WDT'),
  39. ('was', 'BEDZ'),
  40. ('won', 'VBN'),
  41. ('by', 'IN'),
  42. ('Mayor-nominate', 'NN-TL'),
  43. ('Ivan', 'NP'),
  44. ('Allen', 'NP'),
  45. ('Jr.', 'NP'),
  46. ('.', '.'),
  47. ] # source: brown corpus
  48. cp = RegexpParser('CHUNK: {<N.*>{4,}}')
  49. tree = cp.parse(sent)
  50. assert (
  51. tree.pformat()
  52. == """(S
  53. The/AT
  54. September-October/NP
  55. term/NN
  56. jury/NN
  57. had/HVD
  58. been/BEN
  59. charged/VBN
  60. by/IN
  61. Fulton/NP-TL
  62. Superior/JJ-TL
  63. (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
  64. to/TO
  65. investigate/VB
  66. reports/NNS
  67. of/IN
  68. possible/JJ
  69. ``/``
  70. irregularities/NNS
  71. ''/''
  72. in/IN
  73. the/AT
  74. hard-fought/JJ
  75. primary/NN
  76. which/WDT
  77. was/BEDZ
  78. won/VBN
  79. by/IN
  80. (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
  81. ./.)"""
  82. )