scores.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. # Natural Language Toolkit: Evaluation
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from math import fabs
  9. import operator
  10. from random import shuffle
  11. from functools import reduce
  12. try:
  13. from scipy.stats.stats import betai
  14. except ImportError:
  15. betai = None
  16. from nltk.util import LazyConcatenation, LazyMap
  17. def accuracy(reference, test):
  18. """
  19. Given a list of reference values and a corresponding list of test
  20. values, return the fraction of corresponding values that are
  21. equal. In particular, return the fraction of indices
  22. ``0<i<=len(test)`` such that ``test[i] == reference[i]``.
  23. :type reference: list
  24. :param reference: An ordered list of reference values.
  25. :type test: list
  26. :param test: A list of values to compare against the corresponding
  27. reference values.
  28. :raise ValueError: If ``reference`` and ``length`` do not have the
  29. same length.
  30. """
  31. if len(reference) != len(test):
  32. raise ValueError("Lists must have the same length.")
  33. return sum(x == y for x, y in zip(reference, test)) / len(test)
  34. def precision(reference, test):
  35. """
  36. Given a set of reference values and a set of test values, return
  37. the fraction of test values that appear in the reference set.
  38. In particular, return card(``reference`` intersection ``test``)/card(``test``).
  39. If ``test`` is empty, then return None.
  40. :type reference: set
  41. :param reference: A set of reference values.
  42. :type test: set
  43. :param test: A set of values to compare against the reference set.
  44. :rtype: float or None
  45. """
  46. if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
  47. raise TypeError("reference and test should be sets")
  48. if len(test) == 0:
  49. return None
  50. else:
  51. return len(reference.intersection(test)) / len(test)
  52. def recall(reference, test):
  53. """
  54. Given a set of reference values and a set of test values, return
  55. the fraction of reference values that appear in the test set.
  56. In particular, return card(``reference`` intersection ``test``)/card(``reference``).
  57. If ``reference`` is empty, then return None.
  58. :type reference: set
  59. :param reference: A set of reference values.
  60. :type test: set
  61. :param test: A set of values to compare against the reference set.
  62. :rtype: float or None
  63. """
  64. if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
  65. raise TypeError("reference and test should be sets")
  66. if len(reference) == 0:
  67. return None
  68. else:
  69. return len(reference.intersection(test)) / len(reference)
  70. def f_measure(reference, test, alpha=0.5):
  71. """
  72. Given a set of reference values and a set of test values, return
  73. the f-measure of the test values, when compared against the
  74. reference values. The f-measure is the harmonic mean of the
  75. ``precision`` and ``recall``, weighted by ``alpha``. In particular,
  76. given the precision *p* and recall *r* defined by:
  77. - *p* = card(``reference`` intersection ``test``)/card(``test``)
  78. - *r* = card(``reference`` intersection ``test``)/card(``reference``)
  79. The f-measure is:
  80. - *1/(alpha/p + (1-alpha)/r)*
  81. If either ``reference`` or ``test`` is empty, then ``f_measure``
  82. returns None.
  83. :type reference: set
  84. :param reference: A set of reference values.
  85. :type test: set
  86. :param test: A set of values to compare against the reference set.
  87. :rtype: float or None
  88. """
  89. p = precision(reference, test)
  90. r = recall(reference, test)
  91. if p is None or r is None:
  92. return None
  93. if p == 0 or r == 0:
  94. return 0
  95. return 1.0 / (alpha / p + (1 - alpha) / r)
  96. def log_likelihood(reference, test):
  97. """
  98. Given a list of reference values and a corresponding list of test
  99. probability distributions, return the average log likelihood of
  100. the reference values, given the probability distributions.
  101. :param reference: A list of reference values
  102. :type reference: list
  103. :param test: A list of probability distributions over values to
  104. compare against the corresponding reference values.
  105. :type test: list(ProbDistI)
  106. """
  107. if len(reference) != len(test):
  108. raise ValueError("Lists must have the same length.")
  109. # Return the average value of dist.logprob(val).
  110. total_likelihood = sum(dist.logprob(val) for (val, dist) in zip(reference, test))
  111. return total_likelihood / len(reference)
  112. def approxrand(a, b, **kwargs):
  113. """
  114. Returns an approximate significance level between two lists of
  115. independently generated test values.
  116. Approximate randomization calculates significance by randomly drawing
  117. from a sample of the possible permutations. At the limit of the number
  118. of possible permutations, the significance level is exact. The
  119. approximate significance level is the sample mean number of times the
  120. statistic of the permutated lists varies from the actual statistic of
  121. the unpermuted argument lists.
  122. :return: a tuple containing an approximate significance level, the count
  123. of the number of times the pseudo-statistic varied from the
  124. actual statistic, and the number of shuffles
  125. :rtype: tuple
  126. :param a: a list of test values
  127. :type a: list
  128. :param b: another list of independently generated test values
  129. :type b: list
  130. """
  131. shuffles = kwargs.get("shuffles", 999)
  132. # there's no point in trying to shuffle beyond all possible permutations
  133. shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
  134. stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
  135. verbose = kwargs.get("verbose", False)
  136. if verbose:
  137. print("shuffles: %d" % shuffles)
  138. actual_stat = fabs(stat(a) - stat(b))
  139. if verbose:
  140. print("actual statistic: %f" % actual_stat)
  141. print("-" * 60)
  142. c = 1e-100
  143. lst = LazyConcatenation([a, b])
  144. indices = list(range(len(a) + len(b)))
  145. for i in range(shuffles):
  146. if verbose and i % 10 == 0:
  147. print("shuffle: %d" % i)
  148. shuffle(indices)
  149. pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[: len(a)]))
  150. pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a) :]))
  151. pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)
  152. if pseudo_stat >= actual_stat:
  153. c += 1
  154. if verbose and i % 10 == 0:
  155. print("pseudo-statistic: %f" % pseudo_stat)
  156. print("significance: %f" % ((c + 1) / (i + 1)))
  157. print("-" * 60)
  158. significance = (c + 1) / (shuffles + 1)
  159. if verbose:
  160. print("significance: %f" % significance)
  161. if betai:
  162. for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
  163. print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))
  164. return (significance, c, shuffles)
  165. def demo():
  166. print("-" * 75)
  167. reference = "DET NN VB DET JJ NN NN IN DET NN".split()
  168. test = "DET VB VB DET NN NN NN IN DET NN".split()
  169. print("Reference =", reference)
  170. print("Test =", test)
  171. print("Accuracy:", accuracy(reference, test))
  172. print("-" * 75)
  173. reference_set = set(reference)
  174. test_set = set(test)
  175. print("Reference =", reference_set)
  176. print("Test = ", test_set)
  177. print("Precision:", precision(reference_set, test_set))
  178. print(" Recall:", recall(reference_set, test_set))
  179. print("F-Measure:", f_measure(reference_set, test_set))
  180. print("-" * 75)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo()