# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# numpy is required at runtime; the import is guarded so the module can
# still be imported when numpy is unavailable.
try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import VectorSpaceClusterer

class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """
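
    # Notation used below: given vectors x_1..x_n and clusters 1..k, the
    # E-step computes responsibilities h[i, j] = P(cluster j | x_i), and the
    # M-step re-estimates each cluster's prior, mean and covariance as
    # h-weighted maximum likelihood estimates; see cluster_vectorspace().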

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values; compare against None
        # explicitly, since truth-testing a numpy array raises a ValueError
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration: log-likelihood", lastl)

            # E-step, calculate hidden variables, h[i, j]
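            # Each h[i, j] starts as the joint P(cluster j) * P(x_i | cluster j);
            # normalising each row by its sum then gives the posterior
            # responsibility P(cluster j | x_i) via Bayes' rule.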
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
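            # Standard h-weighted EM re-estimates, with deltas taken against
            # the previous mean of each cluster:
            #   mean_j  = sum_i h[i, j] * x_i / sum_i h[i, j]
            #   cov_j   = sum_i h[i, j] * outer(x_i - mean_j, x_i - mean_j) / sum_i h[i, j]
            #   prior_j = sum_i h[i, j] / n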
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            ll = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - ll) < self._conv_threshold:
                converged = True
            lastl = ll

    def classify_vectorspace(self, vector):
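        # MAP assignment: pick the cluster maximising the unnormalised
        # posterior prior_j * P(vector | cluster j).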
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )

    def _gaussian(self, mean, cvm, x):
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
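        # Multivariate normal density:
        #   N(x; mean, cvm) = (2*pi)^(-m/2) * det(cvm)^(-1/2)
        #                       * exp(-0.5 * (x - mean)^T * inv(cvm) * (x - mean))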
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent b goes to negative infinity, i.e.
            # the inverse of cvm is huge (cvm is almost singular), so the
            # density is effectively zero
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
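        # Log-likelihood of the data under the mixture model:
        #   sum_i log( sum_j prior_j * N(x_i; mean_j, cvm_j) )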
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)


def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """
    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze
    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As:       ", clusters)
    print()

    for c in range(2):
        print("Cluster:", c)
        print("Prior:  ", clusterer._priors[c])
        print("Mean:   ", clusterer._means[c])
        print("Covar:  ", clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print("classification_probdist(%s):" % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print("%s => %.0f%%" % (sample, pdist.prob(sample) * 100))


if __name__ == "__main__":
    demo()