# -*- coding: utf-8 -*-
"""
Tests for BLEU translation evaluation metric
"""

import functools
import io
import unittest

from nltk.data import find
from nltk.translate.bleu_score import (
    modified_precision,
    brevity_penalty,
    closest_ref_length,
)
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction


class TestBLEU(unittest.TestCase):
    def test_modified_precision(self):
        """
        Examples from the original BLEU paper
        http://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = 'the cat is on the mat'.split()
        ref2 = 'there is a cat on the mat'.split()
        # Hypothesis sentence(s).
        hyp1 = 'the the the the the the the'.split()

        references = [ref1, ref2]
        # Testing modified unigram precision.
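        # Per the paper, reference words are "exhausted" once matched:
        # 'the' occurs 7 times in hyp1 but at most 2 times in any single
        # reference (ref1), so its clipped count is 2 and the modified
        # unigram precision is 2 / 7 ≈ 0.2857.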
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        assert round(hyp1_unigram_precision, 4) == 0.2857
        # The same check with assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)

        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 0.0

        # Example 2: the "of the" example.
        # Reference sentences.
        ref1 = (
            'It is a guide to action that ensures that the military '
            'will forever heed Party commands'
        ).split()
        ref2 = (
            'It is the guiding principle which guarantees the military '
            'forces always being under the command of the Party'
        ).split()
        ref3 = (
            'It is the practical guide for the army always to heed '
            'the directions of the party'
        ).split()
        # Hypothesis sentence(s).
        hyp1 = 'of the'.split()

        references = [ref1, ref2, ref3]
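        # The short hypothesis 'of the' is fully contained in the references
        # (e.g. '... the command of the Party' in ref2), so both precisions
        # below are expected to be 1.0.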
        # Testing modified unigram precision.
        assert float(modified_precision(references, hyp1, n=1)) == 1.0
        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 1.0

        # Example 3: Proper MT outputs.
        hyp1 = (
            'It is a guide to action which ensures that the military '
            'always obeys the commands of the party'
        ).split()
        hyp2 = (
            'It is to insure the troops forever hearing the activity '
            'guidebook that party direct'
        ).split()
        references = [ref1, ref2, ref3]

        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
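        # From the paper: hyp1 matches 17 of its 18 unigrams (17/18 ≈ 0.9444),
        # hyp2 only 8 of its 14 (8/14 ≈ 0.5714).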
        # Test unigram precision with assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        assert round(hyp1_unigram_precision, 4) == 0.9444
        assert round(hyp2_unigram_precision, 4) == 0.5714

        # Bigram precision.
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
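        # From the paper: hyp1 matches 10 of its 17 bigrams (10/17 ≈ 0.5882),
        # hyp2 only 1 of its 13 (1/13 ≈ 0.0769).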
        # Test bigram precision with assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        assert round(hyp1_bigram_precision, 4) == 0.5882
        assert round(hyp2_bigram_precision, 4) == 0.0769

    def test_brevity_penalty(self):
        # Test case from the brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.
        references = [['a'] * 11, ['a'] * 8]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
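        # The reference length closest to the 7-token hypothesis is 8,
        # so BP = exp(1 - 8/7) ≈ 0.8669.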
        self.assertAlmostEqual(
            brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
        )

        references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
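        # A 7-token reference is now available, so the closest reference
        # length equals the hypothesis length and no penalty applies.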
        assert brevity_penalty(closest_ref_len, hyp_len) == 1.0

    def test_zero_matches(self):
        # Test case where there are no n-gram matches at all.
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU up to nth order of n-grams, where n is len(hypothesis).
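        # No hypothesis unigram occurs in the reference, so p_1 = 0 and
        # the unsmoothed BLEU score is 0 for every order n.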
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 0

    def test_full_matches(self):
        # Test case where the hypothesis matches a reference exactly.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU up to nth order of n-grams, where n is len(hypothesis).
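        # hypothesis == reference, so every p_n is 1 and
        # BLEU = BP * exp(sum_n w_n * log 1) = 1.0.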
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 1.0

    def test_partial_matches_hypothesis_longer_than_reference(self):
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary who loves Mike'.split()

        # Since no 4-gram matches were found, the result should be zero:
        # exp(w_1*log p_1 + w_2*log p_2 + w_3*log p_3 + w_4*log 0) = exp(-inf) = 0.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised because len(reference) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        # Test BLEU to nth order of n-grams, where n > len(hypothesis).
        references = ['John loves Mary ?'.split()]
        hypothesis = 'John loves Mary'.split()
        n = len(hypothesis) + 1
        weights = [1.0 / n] * n  # Uniform weights.
        # Since no n-gram matches were found, the result should be zero:
        # exp(w_1*log p_1 + ... + w_n*log 0) = exp(-inf) = 0.
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )
        # Checks that the warning has been raised because len(hypothesis) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

        # Test case where n > len(hypothesis) and also n > len(reference);
        # a special case where reference == hypothesis.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()
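        # Even a perfect match cannot score above 0 here: a 3-token
        # sentence contains no n-grams of order 4, so p_4 = 0.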
        # Since no 4-gram matches were found, the result should be zero:
        # exp(w_1*log p_1 + ... + w_4*log 0) = exp(-inf) = 0.
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )

    def test_empty_hypothesis(self):
        # Test case where the hypothesis is empty.
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_empty_references(self):
        # Test case where the reference is empty.
        references = [[]]
        hypothesis = 'John loves Mary'.split()
        assert sentence_bleu(references, hypothesis) == 0

    def test_empty_references_and_hypothesis(self):
        # Test case where both the references and the hypothesis are empty.
        references = [[]]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_reference_or_hypothesis_shorter_than_fourgrams(self):
        # Test case where the length of the reference or the hypothesis
        # is shorter than 4.
        references = ['let it go'.split()]
        hypothesis = 'let go it'.split()
        # Checks that the score for this hypothesis/reference pair is 0.0:
        # exp(w_1*log p_1 + ... + w_4*log 0) = exp(-inf) = 0.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


class TestBLEUvsMteval13a(unittest.TestCase):
    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the n-grams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are located in the second-to-last line of the file.
            # The first and last items in that line are the score and system
            # names, so the slice drops them. Materialize the scores into a
            # list so they can be iterated over twice below.
            mteval_bleu_scores = list(
                map(float, mteval_fin.readlines()[-2].split()[1:-1])
            )

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the files.
                # Note: split() automatically strips.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
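                # mteval-v13a reports cumulative BLEU for n = 1..9; each is
                # compared against corpus_bleu with uniform weights 1/i over
                # the first i n-gram orders.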
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references, hypothesis, weights=(1.0 / i,) * i
                    )
                    # Check that the difference between the BLEU scores is
                    # less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    # +/- 0.01 BLEU might be "statistically significant",
                    # the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl.
                chencherry = SmoothingFunction()
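                # method3 implements NIST geometric sequence smoothing,
                # the scheme mteval-v13a.pl applies.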
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references,
                        hypothesis,
                        weights=(1.0 / i,) * i,
                        smoothing_function=chencherry.method3,
                    )
                    assert abs(mteval_bleu - nltk_bleu) < 0.005


class TestBLEUWithBadSentence(unittest.TestCase):
    def test_corpus_bleu_with_bad_sentence(self):
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        ref = (
            "Their tasks include changing a pump on the faulty stokehold ."
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        references = [[ref.split()]]
        hypotheses = [hyp.split()]
        try:  # Check that the warning is raised since there are no 2-gram matches.
            with self.assertWarns(UserWarning):
                # Verify that the BLEU score degenerates to 0.0 since there
                # are no 2-gram (or higher-order) matches.
                self.assertAlmostEqual(
                    corpus_bleu(references, hypotheses), 0.0, places=4
                )
        except AttributeError:
            # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
            self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)