gleu_score.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: GLEU Score
#
# Copyright (C) 2001-2020 NLTK Project
# Authors:
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

""" GLEU score implementation. """

from collections import Counter

from nltk.util import ngrams, everygrams

def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """
    Calculates the sentence level GLEU (Google-BLEU) score described in

        Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
        Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
        Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
        Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
        George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
        Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
        Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.
        eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
        Retrieved on 27 Oct 2016.

    From Wu et al. (2016):

        "The BLEU score has some undesirable properties when used for single
        sentences, as it was designed to be a corpus measure. We therefore
        use a slightly different score for our RL experiments which we call
        the 'GLEU score'. For the GLEU score, we record all sub-sequences of
        1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
        compute a recall, which is the ratio of the number of matching n-grams
        to the number of total n-grams in the target (ground truth) sequence,
        and a precision, which is the ratio of the number of matching n-grams
        to the number of total n-grams in the generated output sequence. Then
        GLEU score is simply the minimum of recall and precision. This GLEU
        score's range is always between 0 (no matches) and 1 (all match) and
        it is symmetrical when switching output and target. According to
        our experiments, GLEU score correlates quite well with the BLEU
        metric on a corpus level but does not have its drawbacks for our per
        sentence reward objective."

    Note: The initial implementation only allowed a single reference, but now
        a list of references is required (which is consistent with
        bleu_score.sentence_bleu()).

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_gleu([ref], hyp)  # doctest: +ELLIPSIS
    0.0909...
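
    (Here the seven-token hypothesis yields 7 + 6 + 5 + 4 = 22 n-grams of
    order 1 to 4 and the six-token reference yields 18; only the two unigram
    occurrences of 'the' in the reference are matched, so the score is
    2 / max(22, 18) = 0.0909...)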

    An example to evaluate normal machine translation outputs

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_gleu([ref1], hyp1)  # doctest: +ELLIPSIS
    0.4393...
    >>> sentence_gleu([ref1], hyp2)  # doctest: +ELLIPSIS
    0.1206...

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: the sentence level GLEU score.
    :rtype: float
    """
    return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len)


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level GLEU scores (i.e. macro-average
    precision), Wu et al. (2016) sum up the matching tokens and the max of
    hypothesis and reference tokens for each sentence, then compute using the
    aggregate values.

    From Mike Schuster (via email):

        "For the corpus, we just add up the two statistics n_match and
        n_all = max(n_all_output, n_all_target) for all sentences, then
        calculate gleu_score = n_match / n_all, so it is not just a mean of
        the sentence gleu scores (in our case, longer sentences count more,
        which I think makes sense as they are more difficult to translate)."
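
    In other words, with per-sentence statistics n_match_i and
    n_all_i = max(n_all_output_i, n_all_target_i), the corpus score is
    sum(n_match_i) / sum(n_all_i), not the mean of the per-sentence ratios.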

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']
    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']
    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_gleu(list_of_references, hypotheses)  # doctest: +ELLIPSIS
    0.5673...

    The example below shows that corpus_gleu() is different from averaging
    sentence_gleu() for hypotheses

    >>> score1 = sentence_gleu([ref1a], hyp1)
    >>> score2 = sentence_gleu([ref2a], hyp2)
    >>> (score1 + score2) / 2  # doctest: +ELLIPSIS
    0.6144...

    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: The corpus-level GLEU score.
    :rtype: float
    """
    # sanity check
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
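            # For example, in the "the the the ..." doctest above: tp = 2,
            # tpfp = 22, tpfn = 18, so min(2/22, 2/18) == 2/max(22, 18) == 0.0909...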
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score
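

# Minimal usage sketch mirroring the doctest examples above.
if __name__ == "__main__":
    reference = ("It is a guide to action that ensures that the military "
                 "will forever heed Party commands").split()
    hypothesis = ("It is a guide to action which ensures that the military "
                  "always obeys the commands of the party").split()
    print(sentence_gleu([reference], hypothesis))    # 0.4393...
    print(corpus_gleu([[reference]], [hypothesis]))  # same value for a one-sentence corpus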