- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: BLEU Score
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
- # Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """BLEU score implementation."""
- import math
- import sys
- from fractions import Fraction
- import warnings
- from collections import Counter
- from nltk.util import ngrams
- def sentence_bleu(
- references,
- hypothesis,
- weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None,
- auto_reweigh=False,
- ):
- """
- Calculate BLEU score (Bilingual Evaluation Understudy) from
- Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
- "BLEU: a method for automatic evaluation of machine translation."
- In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
- ... 'ensures', 'that', 'the', 'military', 'always',
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
- ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
- ... 'that', 'party', 'direct']
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
- ... 'heed', 'Party', 'commands']
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
- ... 'guarantees', 'the', 'military', 'forces', 'always',
- ... 'being', 'under', 'the', 'command', 'of', 'the',
- ... 'Party']
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
- ... 'of', 'the', 'party']
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
- 0.5045...
- If there is no n-gram overlap for any order of n-grams, BLEU returns the
- value 0. This is because the precision for the order of n-grams without
- overlap is 0, and the geometric mean in the final BLEU score computation
- multiplies the 0 with the precision of other n-grams. This results in 0
- (independently of the precision of the other n-gram orders). The following
- example has zero 3-gram and 4-gram overlaps:
- >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
- 0.0
- To avoid this harsh behaviour when no n-gram overlaps are found, a smoothing
- function can be used.
- >>> chencherry = SmoothingFunction()
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
- ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
- 0.0370...
- The default BLEU calculates a score for up to 4-grams using uniform
- weights (this is called BLEU-4). To evaluate your translations with
- higher/lower order ngrams, use customized weights. E.g. when accounting
- for up to 5-grams with uniform weights (this is called BLEU-5) use:
- >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
- 0.3920...
- :param references: reference sentences
- :type references: list(list(str))
- :param hypothesis: a hypothesis sentence
- :type hypothesis: list(str)
- :param weights: weights for unigrams, bigrams, trigrams and so on
- :type weights: list(float)
- :param smoothing_function: A function to smooth the modified precision scores.
- :type smoothing_function: SmoothingFunction
- :param auto_reweigh: Option to re-normalize the weights uniformly.
- :type auto_reweigh: bool
- :return: The sentence-level BLEU score.
- :rtype: float
- """
- return corpus_bleu(
- [references], [hypothesis], weights, smoothing_function, auto_reweigh
- )
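- # Illustrative sketch (added for exposition, not part of the original module):
- # sentence-level BLEU-2, i.e. only unigram and bigram precision with uniform
- # weights. The helper name `_sentence_bleu2` is ours.
- def _sentence_bleu2(references, hypothesis, smoothing_function=None):
-     # Two uniform weights restrict the geometric mean to 1-gram and 2-gram precision.
-     return sentence_bleu(
-         references,
-         hypothesis,
-         weights=(0.5, 0.5),
-         smoothing_function=smoothing_function,
-     )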
- def corpus_bleu(
- list_of_references,
- hypotheses,
- weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None,
- auto_reweigh=False,
- ):
- """
- Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
- the hypotheses and their respective references.
- Instead of averaging the sentence-level BLEU scores (i.e. macro-average
- precision), the original BLEU metric (Papineni et al. 2002) accounts for
- the micro-average precision (i.e. summing the numerators and denominators
- for each hypothesis-reference(s) pair before the division).
- >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
- ... 'ensures', 'that', 'the', 'military', 'always',
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
- >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
- ... 'heed', 'Party', 'commands']
- >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
- ... 'guarantees', 'the', 'military', 'forces', 'always',
- ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
- >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
- ... 'of', 'the', 'party']
- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
- ... 'interested', 'in', 'world', 'history']
- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
- ... 'because', 'he', 'read', 'the', 'book']
- >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
- >>> hypotheses = [hyp1, hyp2]
- >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
- 0.5920...
- The example below shows that corpus_bleu() is different from averaging
- sentence_bleu() for hypotheses:
- >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
- >>> score2 = sentence_bleu([ref2a], hyp2)
- >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
- 0.6223...
- :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
- :type list_of_references: list(list(list(str)))
- :param hypotheses: a list of hypothesis sentences
- :type hypotheses: list(list(str))
- :param weights: weights for unigrams, bigrams, trigrams and so on
- :type weights: list(float)
- :param smoothing_function: A function to smooth the modified precision scores.
- :type smoothing_function: SmoothingFunction
- :param auto_reweigh: Option to re-normalize the weights uniformly.
- :type auto_reweigh: bool
- :return: The corpus-level BLEU score.
- :rtype: float
- """
- # Before proceeding to compute BLEU, perform sanity checks.
- p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
- p_denominators = Counter() # Key = ngram order, and value = no. of ngrams in hyp.
- hyp_lengths, ref_lengths = 0, 0
- assert len(list_of_references) == len(hypotheses), (
- "The number of hypotheses and their reference(s) should be the " "same "
- )
- # Iterate through each hypothesis and their corresponding references.
- for references, hypothesis in zip(list_of_references, hypotheses):
- # For each order of ngram, calculate the numerator and
- # denominator for the corpus-level modified precision.
- for i, _ in enumerate(weights, start=1):
- p_i = modified_precision(references, hypothesis, i)
- p_numerators[i] += p_i.numerator
- p_denominators[i] += p_i.denominator
- # Calculate the hypothesis length and the closest reference length.
- # Adds them to the corpus-level hypothesis and reference counts.
- hyp_len = len(hypothesis)
- hyp_lengths += hyp_len
- ref_lengths += closest_ref_length(references, hyp_len)
- # Calculate corpus-level brevity penalty.
- bp = brevity_penalty(ref_lengths, hyp_lengths)
- # Uniformly re-weight the n-gram orders if the total hypothesis length is
- # shorter than the default maximum order (4) and the weights are still default.
- if auto_reweigh:
- if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
- weights = (1 / hyp_lengths,) * hyp_lengths
- # Collects the various precision values for the different ngram orders.
- p_n = [
- Fraction(p_numerators[i], p_denominators[i], _normalize=False)
- for i, _ in enumerate(weights, start=1)
- ]
- # Returns 0 if there are no matching n-grams.
- # We only need to check for p_numerators[1] == 0, since if there are
- # no unigram matches, there will be no higher-order n-gram matches either.
- if p_numerators[1] == 0:
- return 0
- # If no smoothing function is given, use method0 from the SmoothingFunction class.
- if not smoothing_function:
- smoothing_function = SmoothingFunction().method0
- # Smooth the modified precision values.
- # Note: smoothing_function() may convert values into floats;
- # it tries to retain the Fraction object as much as the
- # smoothing method allows.
- p_n = smoothing_function(
- p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
- )
- s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
- s = bp * math.exp(math.fsum(s))
- return s
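- # Illustrative sketch (ours, for exposition only): the macro-averaged alternative
- # that corpus_bleu() deliberately avoids. Averaging per-sentence scores weights
- # every sentence equally, whereas corpus_bleu() pools the clipped n-gram counts
- # (micro-average) before taking the geometric mean, as in the docstring example.
- def _macro_average_bleu(list_of_references, hypotheses, **kwargs):
-     # One sentence_bleu() call per hypothesis-reference(s) pair, then a plain mean.
-     scores = [
-         sentence_bleu(references, hypothesis, **kwargs)
-         for references, hypothesis in zip(list_of_references, hypotheses)
-     ]
-     return sum(scores) / len(scores)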
- def modified_precision(references, hypothesis, n):
- """
- Calculate modified ngram precision.
- The normal precision method can reward bad translations with high
- precision, e.g. a hypothesis that simply repeats a single reference word
- several times still obtains very high precision.
- This function only returns the Fraction object that contains the numerator
- and denominator necessary to calculate the corpus-level precision.
- To calculate the modified precision for a single pair of hypothesis and
- references, cast the Fraction object into a float.
- The famous "the the the ... " example shows that you can get BLEU precision
- by duplicating high frequency words.
- >>> reference1 = 'the cat is on the mat'.split()
- >>> reference2 = 'there is a cat on the mat'.split()
- >>> hypothesis1 = 'the the the the the the the'.split()
- >>> references = [reference1, reference2]
- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
- 0.2857...
- In the modified n-gram precision, a reference word will be considered
- exhausted after a matching hypothesis word is identified, e.g.
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will',
- ... 'forever', 'heed', 'Party', 'commands']
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
- ... 'guarantees', 'the', 'military', 'forces', 'always',
- ... 'being', 'under', 'the', 'command', 'of', 'the',
- ... 'Party']
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
- ... 'of', 'the', 'party']
- >>> hypothesis = 'of the'.split()
- >>> references = [reference1, reference2, reference3]
- >>> float(modified_precision(references, hypothesis, n=1))
- 1.0
- >>> float(modified_precision(references, hypothesis, n=2))
- 1.0
- An example of a normal machine translation hypothesis:
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
- ... 'ensures', 'that', 'the', 'military', 'always',
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
- ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
- ... 'that', 'party', 'direct']
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will',
- ... 'forever', 'heed', 'Party', 'commands']
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
- ... 'guarantees', 'the', 'military', 'forces', 'always',
- ... 'being', 'under', 'the', 'command', 'of', 'the',
- ... 'Party']
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
- ... 'of', 'the', 'party']
- >>> references = [reference1, reference2, reference3]
- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
- 0.9444...
- >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
- 0.5714...
- >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
- 0.5882352941176471
- >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
- 0.07692...
- :param references: A list of reference translations.
- :type references: list(list(str))
- :param hypothesis: A hypothesis translation.
- :type hypothesis: list(str)
- :param n: The ngram order.
- :type n: int
- :return: BLEU's modified precision for the nth order ngram.
- :rtype: Fraction
- """
- # Extracts all ngrams in hypothesis
- # Set an empty Counter if hypothesis is empty.
- counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
- # Extract a union of references' counts.
- # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
- max_counts = {}
- for reference in references:
- reference_counts = (
- Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
- )
- for ngram in counts:
- max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
- # Assigns the intersection between hypothesis and references' counts.
- clipped_counts = {
- ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
- }
- numerator = sum(clipped_counts.values())
- # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
- # Usually this happens when the ngram order is > len(hypothesis).
- denominator = max(1, sum(counts.values()))
- return Fraction(numerator, denominator, _normalize=False)
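- # Illustrative sketch (ours): the count clipping above, spelled out for the
- # "the the the ..." unigram example in the docstring. The hypothesis contains
- # seven 'the' tokens but the reference contains only two, so the clipped
- # numerator is 2 and the modified unigram precision is 2/7.
- def _demo_clipped_unigram_counts():
-     hypothesis = "the the the the the the the".split()
-     reference = "the cat is on the mat".split()
-     hyp_counts = Counter(ngrams(hypothesis, 1))
-     ref_counts = Counter(ngrams(reference, 1))
-     clipped = {ng: min(cnt, ref_counts[ng]) for ng, cnt in hyp_counts.items()}
-     return sum(clipped.values()), sum(hyp_counts.values())  # (2, 7)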
- def closest_ref_length(references, hyp_len):
- """
- This function finds the reference that is the closest length to the
- hypothesis. The closest reference length is referred to as the *r* variable
- in the brevity penalty formula in Papineni et al. (2002).
- :param references: A list of reference translations.
- :type references: list(list(str))
- :param hyp_len: The length of the hypothesis.
- :type hyp_len: int
- :return: The length of the reference that's closest to the hypothesis.
- :rtype: int
- """
- ref_lens = (len(reference) for reference in references)
- closest_ref_len = min(
- ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
- )
- return closest_ref_len
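- # Illustrative note (ours): the (abs difference, length) sort key above means
- # that when two references are equally close to the hypothesis, the shorter
- # one wins, e.g. closest_ref_length([['a'] * 13, ['a'] * 11], 12) returns 11.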
- def brevity_penalty(closest_ref_len, hyp_len):
- """
- Calculate brevity penalty.
- As the modified n-gram precision still rewards overly short candidate
- sentences, the brevity penalty is used to adjust the overall BLEU score
- according to length.
- An example from the paper: there are three references with lengths 12, 15
- and 17, and a concise hypothesis of length 12. The brevity penalty is 1.
- >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
- >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
- >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
- >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
- >>> references = [reference1, reference2, reference3]
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> brevity_penalty(closest_ref_len, hyp_len)
- 1.0
- In case a hypothesis translation is shorter than the references, a penalty is
- applied.
- >>> references = [['a'] * 28, ['a'] * 28]
- >>> hypothesis = ['a'] * 12
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> brevity_penalty(closest_ref_len, hyp_len)
- 0.2635971381157267
- The length of the closest reference is used to compute the penalty. If the
- length of a hypothesis is 12, and the reference lengths are 13 and 2, the
- penalty is applied because the hypothesis length (12) is less than the
- closest reference length (13).
- >>> references = [['a'] * 13, ['a'] * 2]
- >>> hypothesis = ['a'] * 12
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
- 0.9200...
- The brevity penalty doesn't depend on reference order. More importantly,
- when two reference sentences are at the same distance, the shortest
- reference sentence length is used.
- >>> references = [['a'] * 13, ['a'] * 11]
- >>> hypothesis = ['a'] * 12
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
- >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
- >>> bp1 == bp2 == 1
- True
- A test example from mteval-v13a.pl (starting from line 705):
- >>> references = [['a'] * 11, ['a'] * 8]
- >>> hypothesis = ['a'] * 7
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
- 0.8668...
- >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
- >>> hypothesis = ['a'] * 7
- >>> hyp_len = len(hypothesis)
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
- >>> brevity_penalty(closest_ref_len, hyp_len)
- 1.0
- :param hyp_len: The length of the hypothesis for a single sentence OR the
- sum of all the hypotheses' lengths for a corpus
- :type hyp_len: int
- :param closest_ref_len: The length of the closest reference for a single
- hypothesis OR the sum of all the closest reference lengths for a corpus.
- :type closest_ref_len: int
- :return: BLEU's brevity penalty.
- :rtype: float
- """
- if hyp_len > closest_ref_len:
- return 1
- # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
- elif hyp_len == 0:
- return 0
- else:
- return math.exp(1 - closest_ref_len / hyp_len)
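- # Illustrative note (ours): the penalty above is BP = exp(1 - r / c), where r is
- # the closest reference length and c the hypothesis length, applied only when
- # c <= r. For the docstring example with r = 28 and c = 12:
- #     math.exp(1 - 28 / 12)  # -> 0.2635971381157267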
- class SmoothingFunction:
- """
- This is an implementation of the smoothing techniques
- for segment-level BLEU scores that was presented in
- Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
- Smoothing Techniques for Sentence-Level BLEU. In WMT14.
- http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
- """
- def __init__(self, epsilon=0.1, alpha=5, k=5):
- """
- This will initialize the parameters required for the various smoothing
- techniques. The default values are set to the numbers used in the
- experiments from Chen and Cherry (2014).
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
- ... 'that', 'the', 'military', 'always', 'obeys', 'the',
- ... 'commands', 'of', 'the', 'party']
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
- ... 'that', 'the', 'military', 'will', 'forever', 'heed',
- ... 'Party', 'commands']
- >>> chencherry = SmoothingFunction()
- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
- 0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
- 0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
- 0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
- 0.4489...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
- 0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
- 0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
- 0.4905...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
- 0.4135...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
- 0.4905...
- :param epsilon: the epsilon value used in method 1
- :type epsilon: float
- :param alpha: the alpha value used in method 6
- :type alpha: int
- :param k: the k value used in method 4
- :type k: int
- """
- self.epsilon = epsilon
- self.alpha = alpha
- self.k = k
- def method0(self, p_n, *args, **kwargs):
- """
- No smoothing.
- """
- p_n_new = []
- for i, p_i in enumerate(p_n):
- if p_i.numerator != 0:
- p_n_new.append(p_i)
- else:
- _msg = str(
- "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
- "Therefore the BLEU score evaluates to 0, independently of\n"
- "how many N-gram overlaps of lower order it contains.\n"
- "Consider using lower n-gram order or use "
- "SmoothingFunction()"
- ).format(i + 1)
- warnings.warn(_msg)
- # When numerator==0 (whether denominator==0 or not), the precision
- # score for that order is 0 or undefined. Because BLEU's geometric
- # mean is computed in logarithm space, we return sys.float_info.min
- # instead, so that math.log() remains defined and the order contributes
- # an effectively zero precision score.
- p_n_new.append(sys.float_info.min)
- return p_n_new
- def method1(self, p_n, *args, **kwargs):
- """
- Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
- """
- return [
- (p_i.numerator + self.epsilon) / p_i.denominator
- if p_i.numerator == 0
- else p_i
- for p_i in p_n
- ]
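- # Illustrative note (ours): with the default epsilon = 0.1, an order with zero
- # matches contributes epsilon / denominator instead of 0, so the geometric mean
- # no longer collapses to 0; orders with non-zero matches pass through unchanged.
- # This is what makes the 0.0370... example in the sentence_bleu() docstring
- # non-zero for hypothesis2.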
- def method2(self, p_n, *args, **kwargs):
- """
- Smoothing method 2: Add 1 to both numerator and denominator from
- Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
- machine translation quality using longest common subsequence and
- skip-bigram statistics. In ACL04.
- """
- return [
- Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
- for p_i in p_n
- ]
- def method3(self, p_n, *args, **kwargs):
- """
- Smoothing method 3: NIST geometric sequence smoothing
- The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
- precision score whose matching n-gram count is null.
- k is 1 for the first 'n' value for which the n-gram match count is null.
- For example, if the text contains:
- - one 2-gram match
- - and (consequently) two 1-gram matches
- the n-gram count for each individual precision score would be:
- - n=1 => prec_count = 2 (two unigrams)
- - n=2 => prec_count = 1 (one bigram)
- - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
- - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
- """
- incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
- for i, p_i in enumerate(p_n):
- if p_i.numerator == 0:
- p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
- incvnt += 1
- return p_n
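- # Illustrative note (ours): the running exponent `incvnt` starts at 1 and only
- # increases on orders with zero matches, so the first empty order receives a
- # smoothed count of 1/2, the next one 1/4, and so on, each divided by its own
- # denominator, matching the worked example in the docstring above.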
- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
- """
- Smoothing method 4:
- Shorter translations may have inflated precision values due to having
- smaller denominators; therefore, we give them proportionally
- smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
- suggest dividing by 1/ln(len(T)), where T is the length of the translation.
- """
- hyp_len = hyp_len if hyp_len else len(hypothesis)
- for i, p_i in enumerate(p_n):
- if p_i.numerator == 0 and hyp_len != 0:
- incvnt = i + 1 * self.k / math.log(
- hyp_len
- ) # Note that this K is different from the K from NIST.
- p_n[i] = incvnt / p_i.denominator
- return p_n
- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
- """
- Smoothing method 5:
- The matched counts for similar values of n should be similar. To
- calculate the n-gram matched count, it averages the n−1, n and n+1 gram
- matched counts.
- """
- hyp_len = hyp_len if hyp_len else len(hypothesis)
- m = {}
- # Requires a precision value for an additional ngram order.
- p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
- m[-1] = p_n[0] + 1
- for i, p_i in enumerate(p_n):
- p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
- m[i] = p_n[i]
- return p_n
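- # Illustrative note (ours): method5 replaces each precision with the running
- # average m[i] = (m[i-1] + p_i + p_(i+1)) / 3, seeded with m[-1] = p_1 + 1 and
- # using an extra 5-gram precision as the p_(n+1) term for the highest order.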
- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
- """
- Smoothing method 6:
- Interpolates the maximum likelihood estimate of the precision *p_n* with
- a prior estimate *pi0*. The prior is estimated by assuming that the ratio
- between pn and pn−1 will be the same as that between pn−1 and pn−2; from
- Gao and He (2013) Training MRF-Based Phrase Translation Models using
- Gradient Ascent. In NAACL.
- """
- hyp_len = hyp_len if hyp_len else len(hypothesis)
- # This smoothing only works when p_1 and p_2 are non-zero.
- # Raise an error with an appropriate message when the input is too short
- # to use this smoothing technique.
- assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
- for i, p_i in enumerate(p_n):
- if i in [0, 1]: # Skips the first 2 orders of ngrams.
- continue
- else:
- pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
- # No. of ngrams in translation that matches the reference.
- m = p_i.numerator
- # No. of ngrams in translation.
- l = sum(1 for _ in ngrams(hypothesis, i + 1))
- # Calculates the interpolated precision.
- p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
- return p_n
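- # Illustrative note (ours): for every order n >= 3, the interpolated precision is
- # (m + alpha * pi0) / (l + alpha), where m is the matched n-gram count, l is the
- # total number of n-grams in the hypothesis, and pi0 = p_(n-1)**2 / p_(n-2) is the
- # prior extrapolated from the two lower orders.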
- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
- """
- Smoothing method 7:
- Interpolates methods 4 and 5.
- """
- hyp_len = hyp_len if hyp_len else len(hypothesis)
- p_n = self.method4(p_n, references, hypothesis, hyp_len)
- p_n = self.method5(p_n, references, hypothesis, hyp_len)
- return p_n
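- # Minimal usage sketch (ours, not part of the library): run the module directly to
- # compare corpus-level BLEU with and without Chen & Cherry smoothing on the
- # sentences used in the docstring examples.
- if __name__ == "__main__":
-     hyp = ("It is a guide to action which ensures that the military always "
-            "obeys the commands of the party").split()
-     ref = ("It is a guide to action that ensures that the military will "
-            "forever heed Party commands").split()
-     chencherry = SmoothingFunction()
-     print(corpus_bleu([[ref]], [hyp]))
-     print(corpus_bleu([[ref]], [hyp], smoothing_function=chencherry.method1))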
|