test_collocations.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. # -*- coding: utf-8 -*-
  2. import unittest
  3. from nltk.collocations import BigramCollocationFinder
  4. from nltk.metrics import BigramAssocMeasures
  5. ## Test bigram counters with discontinuous bigrams and repeated words
  6. _EPSILON = 1e-8
  7. def close_enough(x, y):
  8. """Verify that two sequences of n-gram association values are within
  9. _EPSILON of each other.
  10. """
  11. for (x1, y1) in zip(x, y):
  12. if x1[0] != y1[0] or abs(x1[1] - y1[1]) > _EPSILON:
  13. return False
  14. return True
  15. class TestBigram(unittest.TestCase):
  16. def test_bigram2(self):
  17. sent = 'this this is is a a test test'.split()
  18. b = BigramCollocationFinder.from_words(sent)
  19. # python 2.6 does not have assertItemsEqual or assertListEqual
  20. self.assertEqual(
  21. sorted(b.ngram_fd.items()),
  22. sorted(
  23. [
  24. (('a', 'a'), 1),
  25. (('a', 'test'), 1),
  26. (('is', 'a'), 1),
  27. (('is', 'is'), 1),
  28. (('test', 'test'), 1),
  29. (('this', 'is'), 1),
  30. (('this', 'this'), 1),
  31. ]
  32. ),
  33. )
  34. self.assertEqual(
  35. sorted(b.word_fd.items()),
  36. sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
  37. )
  38. self.assertTrue(
  39. len(sent) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1
  40. )
  41. self.assertTrue(
  42. close_enough(
  43. sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
  44. sorted(
  45. [
  46. (('a', 'a'), 1.0),
  47. (('a', 'test'), 1.0),
  48. (('is', 'a'), 1.0),
  49. (('is', 'is'), 1.0),
  50. (('test', 'test'), 1.0),
  51. (('this', 'is'), 1.0),
  52. (('this', 'this'), 1.0),
  53. ]
  54. ),
  55. )
  56. )
  57. def test_bigram3(self):
  58. sent = 'this this is is a a test test'.split()
  59. b = BigramCollocationFinder.from_words(sent, window_size=3)
  60. self.assertEqual(
  61. sorted(b.ngram_fd.items()),
  62. sorted(
  63. [
  64. (('a', 'test'), 3),
  65. (('is', 'a'), 3),
  66. (('this', 'is'), 3),
  67. (('a', 'a'), 1),
  68. (('is', 'is'), 1),
  69. (('test', 'test'), 1),
  70. (('this', 'this'), 1),
  71. ]
  72. ),
  73. )
  74. self.assertEqual(
  75. sorted(b.word_fd.items()),
  76. sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
  77. )
  78. self.assertTrue(
  79. len(sent)
  80. == sum(b.word_fd.values())
  81. == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0
  82. )
  83. self.assertTrue(
  84. close_enough(
  85. sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
  86. sorted(
  87. [
  88. (('a', 'test'), 1.584962500721156),
  89. (('is', 'a'), 1.584962500721156),
  90. (('this', 'is'), 1.584962500721156),
  91. (('a', 'a'), 0.0),
  92. (('is', 'is'), 0.0),
  93. (('test', 'test'), 0.0),
  94. (('this', 'this'), 0.0),
  95. ]
  96. ),
  97. )
  98. )
  99. def test_bigram5(self):
  100. sent = 'this this is is a a test test'.split()
  101. b = BigramCollocationFinder.from_words(sent, window_size=5)
  102. self.assertEqual(
  103. sorted(b.ngram_fd.items()),
  104. sorted(
  105. [
  106. (('a', 'test'), 4),
  107. (('is', 'a'), 4),
  108. (('this', 'is'), 4),
  109. (('is', 'test'), 3),
  110. (('this', 'a'), 3),
  111. (('a', 'a'), 1),
  112. (('is', 'is'), 1),
  113. (('test', 'test'), 1),
  114. (('this', 'this'), 1),
  115. ]
  116. ),
  117. )
  118. self.assertEqual(
  119. sorted(b.word_fd.items()),
  120. sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
  121. )
  122. self.assertTrue(
  123. len(sent)
  124. == sum(b.word_fd.values())
  125. == (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0
  126. )
  127. self.assertTrue(
  128. close_enough(
  129. sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
  130. sorted(
  131. [
  132. (('a', 'test'), 1.0),
  133. (('is', 'a'), 1.0),
  134. (('this', 'is'), 1.0),
  135. (('is', 'test'), 0.5849625007211562),
  136. (('this', 'a'), 0.5849625007211562),
  137. (('a', 'a'), -1.0),
  138. (('is', 'is'), -1.0),
  139. (('test', 'test'), -1.0),
  140. (('this', 'this'), -1.0),
  141. ]
  142. ),
  143. )
  144. )