.. Copyright (C) 2001-2020 NLTK Project
.. For license information, see LICENSE.TXT

=======
Metrics
=======

The `nltk.metrics` package provides a variety of *evaluation measures*
which can be used for a wide range of NLP tasks.

>>> from nltk.metrics import *

------------------
Standard IR Scores
------------------

We can use standard scores from information retrieval to test the
performance of taggers, chunkers, etc.

>>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
>>> print(accuracy(reference, test))
0.8
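
Accuracy is simply the proportion of positions at which the two sequences
agree, which we can confirm directly (a quick sanity check, not part of the
original test suite):

>>> sum(1 for r, t in zip(reference, test) if r == t) / len(reference)
0.8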

The following measures apply to sets:

>>> reference_set = set(reference)
>>> test_set = set(test)
>>> precision(reference_set, test_set)
1.0
>>> print(recall(reference_set, test_set))
0.8
>>> print(f_measure(reference_set, test_set))
0.88888888888...
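
With the default `alpha` of 0.5, `f_measure` is the harmonic mean of
precision and recall, which we can verify against the usual formula
2*p*r/(p+r) (a quick check, not part of the original test suite):

>>> p, r = precision(reference_set, test_set), recall(reference_set, test_set)
>>> abs(f_measure(reference_set, test_set) - 2 * p * r / (p + r)) < 1e-12
True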

Measuring the likelihood of the data, given probability distributions:

>>> from nltk import FreqDist, MLEProbDist
>>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
>>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
>>> print(log_likelihood(['a', 'd'], [pdist1, pdist2]))
-2.7075187496...
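
The result is the average log (base 2) probability of each item under the
corresponding distribution: 'a' has probability 3/16 under `pdist1` and 'd'
has probability 2/16 under `pdist2` (a quick check, not part of the original
test suite):

>>> from math import log2
>>> print(round((log2(3/16) + log2(2/16)) / 2, 10))
-2.7075187496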

----------------
Distance Metrics
----------------

String edit distance (Levenshtein):

>>> edit_distance("rain", "shine")
3
>>> edit_distance_align("shine", "shine")
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
>>> edit_distance_align("rain", "brainy")
[(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
>>> edit_distance_align("", "brainy")
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
>>> edit_distance_align("", "")
[(0, 0)]
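
`edit_distance` also accepts a `transpositions` flag (present in recent NLTK
releases), which counts an adjacent swap as a single edit
(Damerau-Levenshtein) instead of two:

>>> edit_distance("ab", "ba")  # plain Levenshtein: two substitutions
2
>>> edit_distance("ab", "ba", transpositions=True)  # one transposition
1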

Other distance measures:

>>> s1 = set([1,2,3,4])
>>> s2 = set([3,4,5])
>>> binary_distance(s1, s2)
1.0
>>> print(jaccard_distance(s1, s2))
0.6
>>> print(masi_distance(s1, s2))
0.868
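
Jaccard distance is one minus the ratio of the intersection size to the union
size; here the two sets share 2 of 5 distinct elements, giving 1 - 2/5 (a
quick check, not part of the original test suite):

>>> abs(jaccard_distance(s1, s2) - (1 - len(s1 & s2) / len(s1 | s2))) < 1e-12
True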

----------------------
Miscellaneous Measures
----------------------

Rank Correlation works with two dictionaries mapping keys to ranks.
The dictionaries should have the same set of keys.

>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3})
0.5
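
This agrees with Spearman's formula 1 - 6*sum(d**2)/(n*(n**2-1)), where d is
the rank difference for each key ('e': 0, 't': 1, 'a': 1):

>>> 1 - 6 * (0**2 + 1**2 + 1**2) / (3 * (3**2 - 1))
0.5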

Windowdiff uses a sliding window to compare two segmentations of the same
input (e.g. tokenizations, chunkings).  Segmentations are represented using
strings of zeros and ones.

>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> s4 = "000000000000"
>>> s5 = "111111111111"
>>> windowdiff(s1, s1, 3)
0.0
>>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3
True
>>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8
True
>>> windowdiff(s1, s4, 3)
0.5
>>> windowdiff(s1, s5, 3)
1.0
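
A related segmentation metric, Pk, is available from the same package
(assuming a recent NLTK release that ships `nltk.metrics.segmentation.pk`);
like windowdiff, it is 0.0 for identical segmentations:

>>> from nltk.metrics.segmentation import pk
>>> pk(s1, s1, 3)
0.0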

----------------
Confusion Matrix
----------------

>>> reference = 'This is the reference data.  Testing 123.  aoaeoeoe'
>>> test = 'Thos iz_the rifirenci data.  Testeng 123.  aoaeoeoe'
>>> print(ConfusionMatrix(reference, test))
  |   . 1 2 3 T _ a c d e f g h i n o r s t z |
--+-------------------------------------------+
  |<8>. . . . . 1 . . . . . . . . . . . . . . |
. | .<2>. . . . . . . . . . . . . . . . . . . |
1 | . .<1>. . . . . . . . . . . . . . . . . . |
2 | . . .<1>. . . . . . . . . . . . . . . . . |
3 | . . . .<1>. . . . . . . . . . . . . . . . |
T | . . . . .<2>. . . . . . . . . . . . . . . |
_ | . . . . . .<.>. . . . . . . . . . . . . . |
a | . . . . . . .<4>. . . . . . . . . . . . . |
c | . . . . . . . .<1>. . . . . . . . . . . . |
d | . . . . . . . . .<1>. . . . . . . . . . . |
e | . . . . . . . . . .<6>. . . 3 . . . . . . |
f | . . . . . . . . . . .<1>. . . . . . . . . |
g | . . . . . . . . . . . .<1>. . . . . . . . |
h | . . . . . . . . . . . . .<2>. . . . . . . |
i | . . . . . . . . . . 1 . . .<1>. 1 . . . . |
n | . . . . . . . . . . . . . . .<2>. . . . . |
o | . . . . . . . . . . . . . . . .<3>. . . . |
r | . . . . . . . . . . . . . . . . .<2>. . . |
s | . . . . . . . . . . . . . . . . . .<2>. 1 |
t | . . . . . . . . . . . . . . . . . . .<3>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>

>>> cm = ConfusionMatrix(reference, test)
>>> print(cm.pretty_format(sort_by_count=True))
  |   e a i o s t . T h n r 1 2 3 c d f g _ z |
--+-------------------------------------------+
  |<8>. . . . . . . . . . . . . . . . . . 1 . |
e | .<6>. 3 . . . . . . . . . . . . . . . . . |
a | . .<4>. . . . . . . . . . . . . . . . . . |
i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
o | . . . .<3>. . . . . . . . . . . . . . . . |
s | . . . . .<2>. . . . . . . . . . . . . . 1 |
t | . . . . . .<3>. . . . . . . . . . . . . . |
. | . . . . . . .<2>. . . . . . . . . . . . . |
T | . . . . . . . .<2>. . . . . . . . . . . . |
h | . . . . . . . . .<2>. . . . . . . . . . . |
n | . . . . . . . . . .<2>. . . . . . . . . . |
r | . . . . . . . . . . .<2>. . . . . . . . . |
1 | . . . . . . . . . . . .<1>. . . . . . . . |
2 | . . . . . . . . . . . . .<1>. . . . . . . |
3 | . . . . . . . . . . . . . .<1>. . . . . . |
c | . . . . . . . . . . . . . . .<1>. . . . . |
d | . . . . . . . . . . . . . . . .<1>. . . . |
f | . . . . . . . . . . . . . . . . .<1>. . . |
g | . . . . . . . . . . . . . . . . . .<1>. . |
_ | . . . . . . . . . . . . . . . . . . .<.>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>

>>> print(cm.pretty_format(sort_by_count=True, truncate=10))
  |   e a i o s t . T h |
--+---------------------+
  |<8>. . . . . . . . . |
e | .<6>. 3 . . . . . . |
a | . .<4>. . . . . . . |
i | . 1 .<1>1 . . . . . |
o | . . . .<3>. . . . . |
s | . . . . .<2>. . . . |
t | . . . . . .<3>. . . |
. | . . . . . . .<2>. . |
T | . . . . . . . .<2>. |
h | . . . . . . . . .<2>|
--+---------------------+
(row = reference; col = test)
<BLANKLINE>

>>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
   |                   1 |
   | 1 2 3 4 5 6 7 8 9 0 |
---+---------------------+
 1 |<8>. . . . . . . . . |
 2 | .<6>. 3 . . . . . . |
 3 | . .<4>. . . . . . . |
 4 | . 1 .<1>1 . . . . . |
 5 | . . . .<3>. . . . . |
 6 | . . . . .<2>. . . . |
 7 | . . . . . .<3>. . . |
 8 | . . . . . . .<2>. . |
 9 | . . . . . . . .<2>. |
10 | . . . . . . . . .<2>|
---+---------------------+
(row = reference; col = test)
Value key:
     1:
     2: e
     3: a
     4: i
     5: o
     6: s
     7: t
     8: .
     9: T
    10: h
<BLANKLINE>
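
Individual cells can be read off by indexing the matrix with a
(reference, test) pair of values; for instance, 'e' in the reference was
rendered as 'i' in the test three times:

>>> cm['e', 'e']
6
>>> cm['e', 'i']
3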

--------------------
Association measures
--------------------

These measures are useful to determine whether the co-occurrence of two random
events is meaningful. They are used, for instance, to distinguish collocations
from other pairs of adjacent words.

We take some examples of bigram association calculations from Manning and
Schutze's SNLP, 2nd Ed., chapter 5.

>>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
>>> bam = BigramAssocMeasures
>>> bam.raw_freq(20, (42, 20), N) == 20. / N
True
>>> bam.student_t(n_new_companies, (n_new, n_companies), N)
0.999...
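
This t-score follows Manning and Schutze's approximation: the observed bigram
count minus the count expected under independence, divided by the square root
of the observed count (a quick check, not part of the original test suite):

>>> expected = n_new * n_companies / N  # count expected if independent
>>> t = (n_new_companies - expected) / n_new_companies ** 0.5
>>> abs(bam.student_t(n_new_companies, (n_new, n_companies), N) - t) < 1e-6
True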

>>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
1.54...
>>> bam.likelihood_ratio(150, (12593, 932), N)
1291...

For the other association measures, we check just the relative ordering of
the scores:

>>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
True
>>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
True
>>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
True
>>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
True
>>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False
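
As a sanity check on `pmi` in particular, it matches the textbook definition
log2(P(w1 w2) / (P(w1) P(w2))), with the marginals laid out as in the calls
above (a quick check, not part of the original test suite):

>>> from math import log2
>>> abs(bam.pmi(20, (42, 20), N) - log2((20 / N) / ((42 / N) * (20 / N)))) < 1e-6
True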

For trigrams, we have to provide more count information:

>>> n_w1_w2_w3 = 20
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> n_w1, n_w2, n_w3 = 100, 200, 300
>>> uni_counts = (n_w1, n_w2, n_w3)
>>> N = 14307668
>>> tam = TrigramAssocMeasures
>>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
True
>>> uni_counts2 = (n_w1, n_w2, 100)
>>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
True
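
Trigram PMI generalizes the bigram formula, comparing the joint probability
with the product of the three unigram probabilities (a quick check assuming
the textbook-style definition; not part of the original test suite):

>>> from math import log2
>>> expected = log2(n_w1_w2_w3 * N**2 / (n_w1 * n_w2 * n_w3))
>>> abs(tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N) - expected) < 1e-6
True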

For fourgrams, we have to provide more count information:

>>> n_w1_w2_w3_w4 = 5
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
>>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
>>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
>>> N = 14307668
>>> qam = QuadgramAssocMeasures
>>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
True