# -*- coding: utf-8 -*-
# Natural Language Toolkit: RIBES Score
#
# Copyright (C) 2001-2020 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
#               Mark Byers, ekhumoro, P. Ortiz
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""RIBES score implementation"""

from itertools import islice
import math

from nltk.util import ngrams, choose


def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
    """
    The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
    Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
    Distant Language Pairs". In Proceedings of EMNLP.
    http://www.aclweb.org/anthology/D/D10/D10-1092.pdf

    The generic RIBES score used in shared tasks, e.g. the Workshop for
    Asian Translation (WAT), uses the following calculation:

        RIBES = kendall_tau * (p1**alpha) * (bp**beta)

    Please note that this re-implementation differs from the official
    RIBES implementation; although it emulates the results described in
    the original paper, the official RIBES script implements further
    optimizations.

    Users are encouraged to use the official RIBES script instead of this
    implementation when evaluating their machine translation systems. Refer
    to http://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
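
    For example, reusing the (H0, R0) sentence pair from the
    word_rank_alignment() doctests below, every hypothesis word is matched
    (p1 = 1.0) and the lengths are equal (bp = 1.0), so the score reduces
    to the normalized Kendall's tau, 21/55:

    >>> hyp = str('he read the book because he was interested in world '
    ...           'history').split()
    >>> ref = str('he was interested in world history because he '
    ...           'read the book').split()
    >>> round(sentence_ribes([ref], hyp), 4)
    0.3818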

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The best RIBES score from one of the references.
    :rtype: float
  40. """
  41. best_ribes = -1.0
  42. # Calculates RIBES for each reference and returns the best score.
  43. for reference in references:
  44. # Collects the *worder* from the ranked correlation alignments.
  45. worder = word_rank_alignment(reference, hypothesis)
  46. nkt = kendall_tau(worder)
  47. # Calculates the brevity penalty
  48. bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))
  49. # Calculates the unigram precision, *p1*
  50. p1 = len(worder) / len(hypothesis)
  51. _ribes = nkt * (p1 ** alpha) * (bp ** beta)
  52. if _ribes > best_ribes: # Keeps the best score.
  53. best_ribes = _ribes
  54. return best_ribes


def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
    """
    This function "calculates RIBES for a system output (hypothesis) with
    multiple references, and returns "best" score among multi-references and
    individual scores. The scores are corpus-wise, i.e., averaged by the number
    of sentences." (c.f. RIBES version 1.03.1 code).

    Different from BLEU's micro-average precision, RIBES calculates the
    macro-average precision by averaging the best RIBES score for each pair of
    hypothesis and its corresponding references.

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> round(corpus_ribes(list_of_references, hypotheses), 4)
    0.3597
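
    The corpus score is the macro-average of the per-sentence best scores;
    for hyp2 alone the single reference gives round(21/55, 4) = 0.3818, as
    in the sentence_ribes() example above.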

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The corpus-level RIBES score, averaged over the best score for
        each hypothesis against its references.
    :rtype: float
  94. """
  95. corpus_best_ribes = 0.0
  96. # Iterate through each hypothesis and their corresponding references.
  97. for references, hypothesis in zip(list_of_references, hypotheses):
  98. corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
  99. return corpus_best_ribes / len(hypotheses)


def position_of_ngram(ngram, sentence):
    """
    This function returns the position of the first instance of the ngram
    appearing in a sentence.

    Note that one could also use strings as follows, but the code is a little
    convoluted with the type casting back and forth:

        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
        word_pos = char_pos.count(' ')

    Another way to conceive this is:

        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
                    if ng == ngram)
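
    For example:

    >>> position_of_ngram(('the', 'book'), 'he read the book'.split())
    2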

    :param ngram: The ngram that needs to be searched
    :type ngram: tuple
    :param sentence: The list of tokens to search from.
    :type sentence: list(str)
    :return: the word position of the first match, or None if the ngram is
        not found.
    """
    # Iterates through the ngrams in sentence.
    for i, sublist in enumerate(ngrams(sentence, len(ngram))):
        # Returns the index of the word when ngram matches.
        if ngram == sublist:
            return i


def word_rank_alignment(reference, hypothesis, character_based=False):
    """
    This is the word rank alignment algorithm described in the paper to produce
    the *worder* list, i.e. a list of word indices of the hypothesis word order
    w.r.t. the list of reference words.

    Below is the (H0, R0) example from the Isozaki et al. 2010 paper; note
    that the paper's examples are indexed from 1 but the results here are
    indexed from 0:

    >>> ref = str('he was interested in world history because he '
    ...           'read the book').split()
    >>> hyp = str('he read the book because he was interested in world '
    ...           'history').split()
    >>> word_rank_alignment(ref, hyp)
    [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]

    The (H1, R1) example from the paper, note the 0th index:

    >>> ref = 'John hit Bob yesterday'.split()
    >>> hyp = 'Bob hit John yesterday'.split()
    >>> word_rank_alignment(ref, hyp)
    [2, 1, 0, 3]

    Here is the (H2, R2) example from the paper, note the 0th index here too:

    >>> ref = 'the boy read the book'.split()
    >>> hyp = 'the book was read by the boy'.split()
    >>> word_rank_alignment(ref, hyp)
    [3, 4, 2, 0, 1]

    :param reference: a reference sentence
    :type reference: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param character_based: currently unused in this implementation.
    :type character_based: bool
    """
    worder = []
    hyp_len = len(hypothesis)
    # Stores a list of possible ngrams from the reference sentence.
    # This is used for matching context window later in the algorithm.
    ref_ngrams = []
    hyp_ngrams = []
    for n in range(1, len(reference) + 1):
        for ng in ngrams(reference, n):
            ref_ngrams.append(ng)
        for ng in ngrams(hypothesis, n):
            hyp_ngrams.append(ng)
    for i, h_word in enumerate(hypothesis):
        # If word is not in the reference, continue.
        if h_word not in reference:
            continue
        # If we can determine one-to-one word correspondence for unigrams that
        # only appear once in both the reference and hypothesis.
        elif hypothesis.count(h_word) == reference.count(h_word) == 1:
            worder.append(reference.index(h_word))
        else:
            max_window_size = max(i, hyp_len - i + 1)
            for window in range(1, max_window_size):
                if i + window < hyp_len:  # If searching the right context is possible.
                    # Retrieve the right context window.
                    right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
                    num_times_in_ref = ref_ngrams.count(right_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
                    # If ngram appears only once in both ref and hyp.
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of ngram that matched the reference.
                        pos = position_of_ngram(right_context_ngram, reference)
                        worder.append(pos)  # Add the positions of the ngram.
                        break
                if window <= i:  # If searching the left context is possible.
                    # Retrieve the left context window.
                    left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
                    num_times_in_ref = ref_ngrams.count(left_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of ngram that matched the reference.
                        pos = position_of_ngram(left_context_ngram, reference)
                        # Add the positions of the ngram.
                        worder.append(pos + len(left_context_ngram) - 1)
                        break
    return worder


def find_increasing_sequences(worder):
    """
    Given the *worder* list, this function groups monotonic +1 sequences.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> list(find_increasing_sequences(worder))
    [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    """
    items = iter(worder)
    a, b = None, next(items, None)
    result = [b]
    while b is not None:
        a, b = b, next(items, None)
        if b is not None and a + 1 == b:
            result.append(b)
        else:
            if len(result) > 1:
                yield tuple(result)
            result = [b]


def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs - 1

    Note that the increasing sequences in the *worder* list can be
    discontinuous, and each increasing sequence contributes
    choose(len(seq), 2) increasing pairs, e.g.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> number_possible_pairs = choose(len(worder), 2)
    >>> round(kendall_tau(worder, normalize=False), 3)
    -0.236
    >>> round(kendall_tau(worder), 3)
    0.382
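
    Here the increasing sequences are (7, 8, 9, 10) and (0, 1, 2, 3, 4, 5),
    giving choose(4, 2) + choose(6, 2) = 6 + 15 = 21 increasing pairs out of
    choose(11, 2) = 55 possible pairs, so tau = 2 * 21 / 55 - 1 = -0.236,
    or (tau + 1) / 2 = 0.382 after normalization.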

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    # With fewer than 2 items, choose(worder_len, 2) is 0 and the division
    # below would raise ZeroDivisionError, so return the lowest possible score.
    if worder_len < 2:
        return 0.0 if normalize else -1.0
    # Extract the groups of increasing/monotonic sequences.
    increasing_sequences = find_increasing_sequences(worder)
    # Calculate no. of increasing_pairs in *worder* list.
    num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
    # Calculate no. of possible pairs.
    num_possible_pairs = choose(worder_len, 2)
    # Kendall's Tau computation.
    tau = 2 * num_increasing_pairs / num_possible_pairs - 1
    if normalize:  # If normalized, the tau output falls between 0.0 and 1.0
        return (tau + 1) / 2
    else:  # Otherwise, the tau output falls between -1.0 and +1.0
        return tau


def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    where each d is the difference between a word's index in the reference
    (its *worder* value) and its position in the hypothesis.

    Using the (H0, R0) and (H5, R5) example from the paper:

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> round(spearman_rho(worder, normalize=False), 3)
    -0.591
    >>> round(spearman_rho(worder), 3)
    0.205
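
    Here sum(d**2) = 4*(7**2) + 1*(2**2) + 6*((-5)**2) = 196 + 4 + 150 = 350
    and choose(11 + 1, 3) = 220, so rho = 1 - 350/220 = -0.591, or
    (rho + 1) / 2 = 0.205 after normalization.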

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Spearman's Rho correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    sum_d_square = sum((wi - i) ** 2 for i, wi in enumerate(worder))
    rho = 1 - sum_d_square / choose(worder_len + 1, 3)
    if normalize:  # If normalized, the rho output falls between 0.0 and 1.0
        return (rho + 1) / 2
    else:  # Otherwise, the rho output falls between -1.0 and +1.0
        return rho
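

if __name__ == "__main__":
    # Convenience self-check: run this module's doctests when the file is
    # executed directly, e.g. ``python ribes_score.py -v``.
    import doctest

    doctest.testmod()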