chrf_score.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: ChrF score
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Authors: Maja Popovic
  6. # Contributors: Liling Tan, Aleš Tamchyna (Memsource)
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """ ChrF score implementation """
  10. from collections import Counter, defaultdict
  11. import re
  12. from nltk.util import ngrams
  13. def sentence_chrf(
  14. reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
  15. ):
  16. """
  17. Calculates the sentence level CHRF (Character n-gram F-score) described in
  18. - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
  19. In Proceedings of the 10th Workshop on Machine Translation.
  20. http://www.statmt.org/wmt15/pdf/WMT49.pdf
  21. - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
  22. In Proceedings of the 1st Conference on Machine Translation.
  23. http://www.statmt.org/wmt16/pdf/W16-2341.pdf
  24. This implementation of CHRF only supports a single reference at the moment.
  25. For details not reported in the paper, consult Maja Popovic's original
  26. implementation: https://github.com/m-popovic/chrF
  27. The code should output results equivalent to running CHRF++ with the
  28. following options: -nw 0 -b 3
  29. An example from the original BLEU paper
  30. http://www.aclweb.org/anthology/P02-1040.pdf
  31. >>> ref1 = str('It is a guide to action that ensures that the military '
  32. ... 'will forever heed Party commands').split()
  33. >>> hyp1 = str('It is a guide to action which ensures that the military '
  34. ... 'always obeys the commands of the party').split()
  35. >>> hyp2 = str('It is to insure the troops forever hearing the activity '
  36. ... 'guidebook that party direct').split()
  37. >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
  38. 0.6349...
  39. >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
  40. 0.3330...
  41. The infamous "the the the ... " example
  42. >>> ref = 'the cat is on the mat'.split()
  43. >>> hyp = 'the the the the the the the'.split()
  44. >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
  45. 0.1468...
  46. An example to show that this function allows users to use strings instead of
  47. tokens, i.e. list(str) as inputs.
  48. >>> ref1 = str('It is a guide to action that ensures that the military '
  49. ... 'will forever heed Party commands')
  50. >>> hyp1 = str('It is a guide to action which ensures that the military '
  51. ... 'always obeys the commands of the party')
  52. >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
  53. 0.6349...
  54. >>> type(ref1) == type(hyp1) == str
  55. True
  56. >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
  57. 0.6349...
  58. To skip the unigrams and only use 2- to 3-grams:
  59. >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
  60. 0.6617...
  61. :param references: reference sentence
  62. :type references: list(str) / str
  63. :param hypothesis: a hypothesis sentence
  64. :type hypothesis: list(str) / str
  65. :param min_len: The minimum order of n-gram this function should extract.
  66. :type min_len: int
  67. :param max_len: The maximum order of n-gram this function should extract.
  68. :type max_len: int
  69. :param beta: the parameter to assign more importance to recall over precision
  70. :type beta: float
  71. :param ignore_whitespace: ignore whitespace characters in scoring
  72. :type ignore_whitespace: bool
  73. :return: the sentence level CHRF score.
  74. :rtype: float
  75. """
  76. return corpus_chrf(
  77. [reference],
  78. [hypothesis],
  79. min_len,
  80. max_len,
  81. beta=beta,
  82. ignore_whitespace=ignore_whitespace,
  83. )
  84. def _preprocess(sent, ignore_whitespace):
  85. if type(sent) != str:
  86. # turn list of tokens into a string
  87. sent = " ".join(sent)
  88. if ignore_whitespace:
  89. sent = re.sub(r"\s+", "", sent)
  90. return sent
  91. def chrf_precision_recall_fscore_support(
  92. reference, hypothesis, n, beta=3.0, epsilon=1e-16
  93. ):
  94. """
  95. This function computes the precision, recall and fscore from the ngram
  96. overlaps. It returns the `support` which is the true positive score.
  97. By underspecifying the input type, the function will be agnostic as to how
  98. it computes the ngrams and simply take the whichever element in the list;
  99. it could be either token or character.
  100. :param reference: The reference sentence.
  101. :type reference: list
  102. :param hypothesis: The hypothesis sentence.
  103. :type hypothesis: list
  104. :param n: Extract up to the n-th order ngrams
  105. :type n: int
  106. :param beta: The parameter to assign more importance to recall over precision.
  107. :type beta: float
  108. :param epsilon: The fallback value if the hypothesis or reference is empty.
  109. :type epsilon: float
  110. :return: Returns the precision, recall and f-score and support (true positive).
  111. :rtype: tuple(float)
  112. """
  113. ref_ngrams = Counter(ngrams(reference, n))
  114. hyp_ngrams = Counter(ngrams(hypothesis, n))
  115. # calculate the number of ngram matches
  116. overlap_ngrams = ref_ngrams & hyp_ngrams
  117. tp = sum(overlap_ngrams.values()) # True positives.
  118. tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
  119. tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
  120. try:
  121. prec = tp / tpfp # precision
  122. rec = tp / tpfn # recall
  123. factor = beta ** 2
  124. fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
  125. except ZeroDivisionError:
  126. prec = rec = fscore = epsilon
  127. return prec, rec, fscore, tp
  128. def corpus_chrf(
  129. references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
  130. ):
  131. """
  132. Calculates the corpus level CHRF (Character n-gram F-score), it is the
  133. macro-averaged value of the sentence/segment level CHRF score.
  134. This implementation of CHRF only supports a single reference at the moment.
  135. >>> ref1 = str('It is a guide to action that ensures that the military '
  136. ... 'will forever heed Party commands').split()
  137. >>> ref2 = str('It is the guiding principle which guarantees the military '
  138. ... 'forces always being under the command of the Party').split()
  139. >>>
  140. >>> hyp1 = str('It is a guide to action which ensures that the military '
  141. ... 'always obeys the commands of the party').split()
  142. >>> hyp2 = str('It is to insure the troops forever hearing the activity '
  143. ... 'guidebook that party direct')
  144. >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
  145. 0.3910...
  146. :param references: a corpus of list of reference sentences, w.r.t. hypotheses
  147. :type references: list(list(str))
  148. :param hypotheses: a list of hypothesis sentences
  149. :type hypotheses: list(list(str))
  150. :param min_len: The minimum order of n-gram this function should extract.
  151. :type min_len: int
  152. :param max_len: The maximum order of n-gram this function should extract.
  153. :type max_len: int
  154. :param beta: the parameter to assign more importance to recall over precision
  155. :type beta: float
  156. :param ignore_whitespace: ignore whitespace characters in scoring
  157. :type ignore_whitespace: bool
  158. :return: the sentence level CHRF score.
  159. :rtype: float
  160. """
  161. assert len(references) == len(
  162. hypotheses
  163. ), "The number of hypotheses and their references should be the same"
  164. num_sents = len(hypotheses)
  165. # Keep f-scores for each n-gram order separate
  166. ngram_fscores = defaultdict(lambda: list())
  167. # Iterate through each hypothesis and their corresponding references.
  168. for reference, hypothesis in zip(references, hypotheses):
  169. # preprocess both reference and hypothesis
  170. reference = _preprocess(reference, ignore_whitespace)
  171. hypothesis = _preprocess(hypothesis, ignore_whitespace)
  172. # Calculate f-scores for each sentence and for each n-gram order
  173. # separately.
  174. for n in range(min_len, max_len + 1):
  175. # Compute the precision, recall, fscore and support.
  176. prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
  177. reference, hypothesis, n, beta=beta
  178. )
  179. ngram_fscores[n].append(fscore)
  180. # how many n-gram sizes
  181. num_ngram_sizes = len(ngram_fscores)
  182. # sum of f-scores over all sentences for each n-gram order
  183. total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()]
  184. # macro-average over n-gram orders and over all sentences
  185. return (sum(total_scores) / num_ngram_sizes) / num_sents