# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU Score
#
# Copyright (C) 2001-2020 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""BLEU score implementation."""

import math
import sys
import warnings
from collections import Counter
from fractions import Fraction

from nltk.util import ngrams


def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate BLEU score (Bilingual Evaluation Understudy) from
    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
    "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    If there is no ngram overlap for any order of n-grams, BLEU returns the
    value 0. This is because the precision for the order of n-grams without
    overlap is 0, and the geometric mean in the final BLEU score computation
    multiplies the 0 with the precision of other n-grams. This results in 0
    (independently of the precision of the other n-gram orders). The following
    example has zero 3-gram and 4-gram overlaps:

    >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2), 4) # doctest: +ELLIPSIS
    0.0

    To avoid this harsh behaviour when no ngram overlaps are found, a smoothing
    function can be used.

    >>> chencherry = SmoothingFunction()
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
    ...               smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
    0.0370...

    The default BLEU calculates a score for up to 4-grams using uniform
    weights (this is called BLEU-4). To evaluate your translations with
    higher/lower order ngrams, use customized weights. E.g. when accounting
    for up to 5-grams with uniform weights (this is called BLEU-5) use:

    >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    0.3920...

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The sentence-level BLEU score.
    :rtype: float
    """
    return corpus_bleu(
        [references], [hypothesis], weights, smoothing_function, auto_reweigh
    )


def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pair before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() over the hypotheses:

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the same"
    )

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weigh if the total hypothesis length is shorter than the
    # largest order of n-grams (4) and the weights are at their default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there are no matching n-grams.
    # We only need to check for p_numerators[1] == 0, since if there are
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If no smoothing function is given, use method0 from the
    # SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s


def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method can assign a high score to a wrong
    translation, e.g. a translation that repeats a high-frequency reference
    word many times has a very high plain precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ..." example shows that plain precision can be
    gamed by duplicating high frequency words, while modified precision clips
    each n-gram count to its maximum reference count:

    >>> reference1 = 'the cat is on the mat'.split()
    >>> reference2 = 'there is a cat on the mat'.split()
    >>> hypothesis1 = 'the the the the the the the'.split()
    >>> references = [reference1, reference2]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> hypothesis = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis, n=1))
    1.0
    >>> float(modified_precision(references, hypothesis, n=2))
    1.0

    An example of a normal machine translation hypothesis:

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']
    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.9444...
    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
    0.5714...
    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
    0.5882...
    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
    0.07692...

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """
    # Extracts all ngrams in hypothesis.
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
    }
    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(hypothesis).
    denominator = max(1, sum(counts.values()))
    return Fraction(numerator, denominator, _normalize=False)


def closest_ref_length(references, hyp_len):
    """
    This function finds the reference that is the closest length to the
    hypothesis. The closest reference length is referred to as *r* in the
    brevity penalty formula of Papineni et al. (2002).
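
    When two references are equally close in length, the shorter one is
    chosen; e.g. with reference lengths 13 and 11 and a hypothesis of
    length 12:

    >>> closest_ref_length([['a'] * 13, ['a'] * 11], hyp_len=12)
    11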

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hyp_len: The length of the hypothesis.
    :type hyp_len: int
    :return: The length of the reference that's closest to the hypothesis.
    :rtype: int
    """
    ref_lens = (len(reference) for reference in references)
    closest_ref_len = min(
        ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
    )
    return closest_ref_len


def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    An example from the paper: there are three references with lengths 12,
    15 and 17, and a concise hypothesis of length 12. The brevity penalty
    is 1.

    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> references = [reference1, reference2, reference3]
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    In case a hypothesis translation is shorter than the references, penalty is
    applied.

    >>> references = [['a'] * 28, ['a'] * 28]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    0.2635971381157267

    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).

    >>> references = [['a'] * 13, ['a'] * 2]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.9200...

    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

    >>> references = [['a'] * 13, ['a'] * 11]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
    >>> bp1 == bp2 == 1
    True

    A test example from mteval-v13a.pl (starting at line 705):

    >>> references = [['a'] * 11, ['a'] * 8]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.8668...

    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypothesis.
    :type closest_ref_len: int
    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus.
    :type hyp_len: int
    :return: BLEU's brevity penalty.
    :rtype: float
    """
    if hyp_len > closest_ref_len:
        return 1
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
    elif hyp_len == 0:
        return 0
    else:
        return math.exp(1 - closest_ref_len / hyp_len)


class SmoothingFunction:
    """
    This is an implementation of the smoothing techniques
    for segment-level BLEU scores that was presented in
    Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        This will initialize the parameters required for the various smoothing
        techniques; the default values are set to the numbers used in the
        experiments from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']

        >>> chencherry = SmoothingFunction()
        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4489...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value used in method 1
        :type epsilon: float
        :param alpha: the alpha value used in method 6
        :type alpha: int
        :param k: the k value used in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = (
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using a lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # When the numerator is 0 (whether the denominator is 0 or
                # not), the precision score is 0 or undefined. Because BLEU
                # computes its geometric mean in logarithm space, we append
                # sys.float_info.min instead, so that math.log() stays defined
                # and this order effectively contributes a 0 precision score.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
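
        For example, with the default epsilon of 0.1, a precision of 0/5
        becomes (0 + 0.1) / 5 = 0.02, while non-zero precisions pass through
        unchanged:

        >>> from fractions import Fraction
        >>> chencherry = SmoothingFunction()
        >>> chencherry.method1([Fraction(0, 5, _normalize=False)]) # doctest: +ELLIPSIS
        [0.02...]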
  447. """
  448. return [
  449. (p_i.numerator + self.epsilon) / p_i.denominator
  450. if p_i.numerator == 0
  451. else p_i
  452. for p_i in p_n
  453. ]

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator from
        Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
        machine translation quality using longest common subsequence and
        skip-bigram statistics. In ACL04.
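
        For example, a precision of 0/5 becomes 1/6, and 3/5 becomes 4/6:

        >>> from fractions import Fraction
        >>> chencherry = SmoothingFunction()
        >>> chencherry.method2([Fraction(0, 5, _normalize=False),
        ...                     Fraction(3, 5, _normalize=False)])
        [Fraction(1, 6), Fraction(4, 6)]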
  460. """
  461. return [
  462. Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
  463. for p_i in p_n
  464. ]

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: NIST geometric sequence smoothing.
        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
        precision score whose matching n-gram count is null.
        k is 1 for the first 'n' value for which the n-gram match count is null.

        For example, if the text contains:

        - one 2-gram match
        - and (consequently) two 1-gram matches

        the n-gram count for each individual precision score would be:

        - n=1  =>  prec_count = 2    (two unigrams)
        - n=2  =>  prec_count = 1    (one bigram)
        - n=3  =>  prec_count = 1/2  (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
        - n=4  =>  prec_count = 1/4  (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
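
        A worked example on raw precision fractions:

        >>> from fractions import Fraction
        >>> chencherry = SmoothingFunction()
        >>> chencherry.method3([Fraction(2, 7, _normalize=False),
        ...                     Fraction(0, 6, _normalize=False),
        ...                     Fraction(0, 5, _normalize=False)]) # doctest: +ELLIPSIS
        [Fraction(2, 7), 0.0833..., 0.05]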
  479. """
  480. incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
  481. for i, p_i in enumerate(p_n):
  482. if p_i.numerator == 0:
  483. p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
  484. incvnt += 1
  485. return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4:
        Shorter translations may have inflated precision values due to having
        smaller denominators; therefore, we give them proportionally
        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
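
        For instance, with the default k=5 and a hypothesis of length 4, the
        first zero-match order receives a smoothed count of
        1 / (2 * 5 / ln 4) ~= 0.139 before division by its denominator.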
  493. """
  494. hyp_len = hyp_len if hyp_len else len(hypothesis)
  495. for i, p_i in enumerate(p_n):
  496. if p_i.numerator == 0 and hyp_len != 0:
  497. incvnt = i + 1 * self.k / math.log(
  498. hyp_len
  499. ) # Note that this K is different from the K from NIST.
  500. p_n[i] = incvnt / p_i.denominator
  501. return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5:
        The matched counts for similar values of n should be similar. To
        calculate the n-gram matched count, it averages the n-1, n and n+1 gram
        matched counts.
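
        That is, with m_0 = p_1 + 1, each precision is replaced by the running
        average p_n = (m_{n-1} + p_n + p_{n+1}) / 3, where m_{n-1} is the
        previously smoothed value and p_5 is computed from the hypothesis on
        the fly.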
  508. """
  509. hyp_len = hyp_len if hyp_len else len(hypothesis)
  510. m = {}
  511. # Requires an precision value for an addition ngram order.
  512. p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
  513. m[-1] = p_n[0] + 1
  514. for i, p_i in enumerate(p_n):
  515. p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
  516. m[i] = p_n[i]
  517. return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn-1 will be the same as that between pn-1 and pn-2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.
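
        Concretely, for each order n >= 3 the prior is
        pi0 = p_{n-1}**2 / p_{n-2}, and the smoothed precision is
        (m + alpha * pi0) / (l + alpha), where m is the clipped match count
        and l is the number of n-grams in the hypothesis.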
  526. """
  527. hyp_len = hyp_len if hyp_len else len(hypothesis)
  528. # This smoothing only works when p_1 and p_2 is non-zero.
  529. # Raise an error with an appropriate message when the input is too short
  530. # to use this smoothing technique.
  531. assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Combines methods 4 and 5: applies method 4 first, then method 5 to
        the result.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n