segmentation.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. # Natural Language Toolkit: Text Segmentation Metrics
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # David Doukhan <david.doukhan@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Text Segmentation Metrics
  11. 1. Windowdiff
  12. Pevzner, L., and Hearst, M., A Critique and Improvement of
  13. an Evaluation Metric for Text Segmentation,
  14. Computational Linguistics 28, 19-36
  15. 2. Generalized Hamming Distance
  16. Bookstein A., Kulyukin V.A., Raita T.
  17. Generalized Hamming Distance
  18. Information Retrieval 5, 2002, pp 353-375
  19. Baseline implementation in C++
  20. http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
  21. Study describing benefits of Generalized Hamming Distance Versus
  22. WindowDiff for evaluating text segmentation tasks
  23. Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
  24. TALN 2009
  25. 3. Pk text segmentation metric
  26. Beeferman D., Berger A., Lafferty J. (1999)
  27. Statistical Models for Text Segmentation
  28. Machine Learning, 34, 177-210
  29. """
  30. try:
  31. import numpy as np
  32. except ImportError:
  33. pass
  34. def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
  35. """
  36. Compute the windowdiff score for a pair of segmentations. A
  37. segmentation is any sequence over a vocabulary of two items
  38. (e.g. "0", "1"), where the specified boundary value is used to
  39. mark the edge of a segmentation.
  40. >>> s1 = "000100000010"
  41. >>> s2 = "000010000100"
  42. >>> s3 = "100000010000"
  43. >>> '%.2f' % windowdiff(s1, s1, 3)
  44. '0.00'
  45. >>> '%.2f' % windowdiff(s1, s2, 3)
  46. '0.30'
  47. >>> '%.2f' % windowdiff(s2, s3, 3)
  48. '0.80'
  49. :param seg1: a segmentation
  50. :type seg1: str or list
  51. :param seg2: a segmentation
  52. :type seg2: str or list
  53. :param k: window width
  54. :type k: int
  55. :param boundary: boundary value
  56. :type boundary: str or int or bool
  57. :param weighted: use the weighted variant of windowdiff
  58. :type weighted: boolean
  59. :rtype: float
  60. """
  61. if len(seg1) != len(seg2):
  62. raise ValueError("Segmentations have unequal length")
  63. if k > len(seg1):
  64. raise ValueError(
  65. "Window width k should be smaller or equal than segmentation lengths"
  66. )
  67. wd = 0
  68. for i in range(len(seg1) - k + 1):
  69. ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
  70. if weighted:
  71. wd += ndiff
  72. else:
  73. wd += min(1, ndiff)
  74. return wd / (len(seg1) - k + 1.0)
  75. # Generalized Hamming Distance
  76. def _init_mat(nrows, ncols, ins_cost, del_cost):
  77. mat = np.empty((nrows, ncols))
  78. mat[0, :] = ins_cost * np.arange(ncols)
  79. mat[:, 0] = del_cost * np.arange(nrows)
  80. return mat
  81. def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
  82. for i, rowi in enumerate(rowv):
  83. for j, colj in enumerate(colv):
  84. shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
  85. if rowi == colj:
  86. # boundaries are at the same location, no transformation required
  87. tcost = mat[i, j]
  88. elif rowi > colj:
  89. # boundary match through a deletion
  90. tcost = del_cost + mat[i, j + 1]
  91. else:
  92. # boundary match through an insertion
  93. tcost = ins_cost + mat[i + 1, j]
  94. mat[i + 1, j + 1] = min(tcost, shift_cost)
  95. def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
  96. """
  97. Compute the Generalized Hamming Distance for a reference and a hypothetical
  98. segmentation, corresponding to the cost related to the transformation
  99. of the hypothetical segmentation into the reference segmentation
  100. through boundary insertion, deletion and shift operations.
  101. A segmentation is any sequence over a vocabulary of two items
  102. (e.g. "0", "1"), where the specified boundary value is used to
  103. mark the edge of a segmentation.
  104. Recommended parameter values are a shift_cost_coeff of 2.
  105. Associated with a ins_cost, and del_cost equal to the mean segment
  106. length in the reference segmentation.
  107. >>> # Same examples as Kulyukin C++ implementation
  108. >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
  109. 0.5
  110. >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
  111. 2.0
  112. >>> ghd('011', '110', 1.0, 1.0, 0.5)
  113. 1.0
  114. >>> ghd('1', '0', 1.0, 1.0, 0.5)
  115. 1.0
  116. >>> ghd('111', '000', 1.0, 1.0, 0.5)
  117. 3.0
  118. >>> ghd('000', '111', 1.0, 2.0, 0.5)
  119. 6.0
  120. :param ref: the reference segmentation
  121. :type ref: str or list
  122. :param hyp: the hypothetical segmentation
  123. :type hyp: str or list
  124. :param ins_cost: insertion cost
  125. :type ins_cost: float
  126. :param del_cost: deletion cost
  127. :type del_cost: float
  128. :param shift_cost_coeff: constant used to compute the cost of a shift.
  129. shift cost = shift_cost_coeff * |i - j| where i and j are
  130. the positions indicating the shift
  131. :type shift_cost_coeff: float
  132. :param boundary: boundary value
  133. :type boundary: str or int or bool
  134. :rtype: float
  135. """
  136. ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
  137. hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
  138. nref_bound = len(ref_idx)
  139. nhyp_bound = len(hyp_idx)
  140. if nref_bound == 0 and nhyp_bound == 0:
  141. return 0.0
  142. elif nref_bound > 0 and nhyp_bound == 0:
  143. return nref_bound * ins_cost
  144. elif nref_bound == 0 and nhyp_bound > 0:
  145. return nhyp_bound * del_cost
  146. mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
  147. _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
  148. return mat[-1, -1]
  149. # Beeferman's Pk text segmentation evaluation metric
  150. def pk(ref, hyp, k=None, boundary="1"):
  151. """
  152. Compute the Pk metric for a pair of segmentations A segmentation
  153. is any sequence over a vocabulary of two items (e.g. "0", "1"),
  154. where the specified boundary value is used to mark the edge of a
  155. segmentation.
  156. >>> '%.2f' % pk('0100'*100, '1'*400, 2)
  157. '0.50'
  158. >>> '%.2f' % pk('0100'*100, '0'*400, 2)
  159. '0.50'
  160. >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
  161. '0.00'
  162. :param ref: the reference segmentation
  163. :type ref: str or list
  164. :param hyp: the segmentation to evaluate
  165. :type hyp: str or list
  166. :param k: window size, if None, set to half of the average reference segment length
  167. :type boundary: str or int or bool
  168. :param boundary: boundary value
  169. :type boundary: str or int or bool
  170. :rtype: float
  171. """
  172. if k is None:
  173. k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
  174. err = 0
  175. for i in range(len(ref) - k + 1):
  176. r = ref[i : i + k].count(boundary) > 0
  177. h = hyp[i : i + k].count(boundary) > 0
  178. if r != h:
  179. err += 1
  180. return err / (len(ref) - k + 1.0)
  181. # skip doctests if numpy is not installed
  182. def setup_module(module):
  183. from nose import SkipTest
  184. try:
  185. import numpy
  186. except ImportError:
  187. raise SkipTest("numpy is required for nltk.metrics.segmentation")