# -*- coding: utf-8 -*-
# Natural Language Toolkit: ChrF score
#
# Copyright (C) 2001-2020 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" ChrF score implementation """

import re
from collections import Counter, defaultdict

from nltk.util import ngrams


def sentence_chrf(
    reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the sentence level CHRF (Character n-gram F-score) described in
    - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
      In Proceedings of the 10th Workshop on Machine Translation.
      http://www.statmt.org/wmt15/pdf/WMT49.pdf
    - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
      In Proceedings of the 1st Conference on Machine Translation.
      http://www.statmt.org/wmt16/pdf/W16-2341.pdf

    This implementation of CHRF only supports a single reference at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF
    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    http://www.aclweb.org/anthology/P02-1040.pdf

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
    0.3330...

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
    0.1468...

    An example to show that this function also accepts plain strings as
    inputs, i.e. str, in place of token lists, i.e. list(str).

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands')
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party')
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> type(ref1) == type(hyp1) == str
    True
    >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
    0.6349...

    To skip the unigrams and only use 2- to 3-grams:

    >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
    0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str) / str
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """
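    # Sentence-level chrF is just corpus-level chrF computed over a single
    # reference/hypothesis pair, so delegate to corpus_chrf.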
    return corpus_chrf(
        [reference],
        [hypothesis],
        min_len,
        max_len,
        beta=beta,
        ignore_whitespace=ignore_whitespace,
    )


def _preprocess(sent, ignore_whitespace):
    """
    Normalise the input to a single string: a list of tokens is joined with
    spaces, and all whitespace is removed when ``ignore_whitespace`` is True.
    """
    if not isinstance(sent, str):
        # turn list of tokens into a string
        sent = " ".join(sent)
    if ignore_whitespace:
        sent = re.sub(r"\s+", "", sent)
    return sent
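
# For example, _preprocess(["the", "cat"], ignore_whitespace=True) returns
# "thecat", while ignore_whitespace=False keeps the space-joined "the cat".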


def chrf_precision_recall_fscore_support(
    reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
    """
    This function computes the precision, recall and fscore from the ngram
    overlaps. It returns the `support` which is the true positive score.

    By underspecifying the input type, the function stays agnostic as to how
    the ngrams are computed and simply uses whichever elements are in the
    list; they could be either tokens or characters.

    :param reference: The reference sentence.
    :type reference: list
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list
    :param n: Extract up to the n-th order ngrams
    :type n: int
    :param beta: The parameter to assign more importance to recall over precision.
    :type beta: float
    :param epsilon: The fallback value if the hypothesis or reference is empty.
    :type epsilon: float
    :return: Returns the precision, recall and f-score and support (true positive).
    :rtype: tuple(float)
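
    A small worked example: comparing the character bigrams of 'abcd' and
    'abcf', two of the three bigrams on each side match, so precision, recall
    and (since both are equal) the F-score are all 2/3, with 2 true positives.

    >>> chrf_precision_recall_fscore_support('abcd', 'abcf', 2) # doctest: +ELLIPSIS
    (0.6666..., 0.6666..., 0.6666..., 2)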
    """
    ref_ngrams = Counter(ngrams(reference, n))
    hyp_ngrams = Counter(ngrams(hypothesis, n))

    # calculate the number of ngram matches
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
    tpfn = sum(ref_ngrams.values())  # True positives + False negatives.
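    # F_beta combines precision and recall as
    #   (1 + beta**2) * prec * rec / (beta**2 * prec + rec),
    # so the default beta=3 weights recall considerably more than precision.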
    try:
        prec = tp / tpfp  # precision
        rec = tp / tpfn  # recall
        factor = beta ** 2
        fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
    except ZeroDivisionError:
        prec = rec = fscore = epsilon
    return prec, rec, fscore, tp


def corpus_chrf(
    references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the corpus level CHRF (Character n-gram F-score); it is the
    macro-averaged value of the sentence/segment level CHRF scores.

    This implementation of CHRF only supports a single reference at the moment.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> ref2 = str('It is the guiding principle which guarantees the military '
    ...            'forces always being under the command of the Party').split()
    >>>
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct')
    >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.3910...

    :param references: a corpus of reference sentences, aligned with the hypotheses
    :type references: list(list(str))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the corpus level CHRF score.
    :rtype: float
    """
    assert len(references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"
    num_sents = len(hypotheses)

    # Keep f-scores for each n-gram order separate
    ngram_fscores = defaultdict(list)

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(references, hypotheses):

        # preprocess both reference and hypothesis
        reference = _preprocess(reference, ignore_whitespace)
        hypothesis = _preprocess(hypothesis, ignore_whitespace)

        # Calculate f-scores for each sentence and for each n-gram order
        # separately.
        for n in range(min_len, max_len + 1):
            # Compute the precision, recall, fscore and support.
            prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
                reference, hypothesis, n, beta=beta
            )
            ngram_fscores[n].append(fscore)
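
    # Every segment contributes one f-score per n-gram order, so averaging
    # first over orders and then over segments is the same as taking the plain
    # mean of all per-segment, per-order f-scores.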
    # how many n-gram sizes
    num_ngram_sizes = len(ngram_fscores)

    # sum of f-scores over all sentences for each n-gram order
    total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()]

    # macro-average over n-gram orders and over all sentences
    return (sum(total_scores) / num_ngram_sizes) / num_sents