kmeans.py

# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import copy
import random
import sys

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import VectorSpaceClusterer


class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrarily chosen means, then
    allocates each vector to the cluster with the closest mean. It then
    recalculates the means of each cluster as the centroid of the vectors in
    the cluster. This process repeats until the cluster memberships stabilise.
    This is a hill-climbing algorithm which may converge to a local optimum.
    Hence the clustering is often repeated with random initial means and the
    most commonly occurring output means are chosen.
    """

    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param num_means: the number of means to use (may use fewer)
        :type num_means: int
        :param distance: measure of distance between two vectors
        :type distance: function taking two vectors and returning a float
        :param repeats: number of randomised clustering trials to use
        :type repeats: int
        :param conv_test: maximum variation in mean differences before
                          deemed convergent
        :type conv_test: number
        :param initial_means: set of k initial means
        :type initial_means: sequence of vectors
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionality with SVD
        :type svd_dimensions: int
        :param rng: random number generator (or None)
        :type rng: Random
        :param avoid_empty_clusters: include current centroid in computation
                                     of next one; avoids undefined behavior
                                     when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # supplying fixed initial means makes repeated randomised trials pointless
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters
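
    # Sketch of the other construction mode (added commentary, not original
    # code): explicit initial means require repeats to stay at its default
    # of 1, per the assertions above.
    #
    #     >>> means = [numpy.array([1.0, 1.0]), numpy.array([8.0, 8.0])]
    #     >>> km = KMeansClusterer(2, euclidean_distance, initial_means=means)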

    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            # resample fresh random means for every trial after the first;
            # only the first trial may use caller-supplied initial means
            if not self._means or trial > 0:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means
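
    # Worked illustration of the trial selection above (added commentary):
    # with three trials whose pairwise mean-set distances are d(0,1)=2,
    # d(0,2)=5 and d(1,2)=3, the per-trial totals are 7, 5 and 8, so the
    # means from trial 1 (total 5) are kept as the most central result.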

    def _cluster_vectorspace(self, vectors, trace=False):
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace:
                    print("iteration")
                # for i in range(self._num_means):
                #     print('  mean', i, 'allocated', len(clusters[i]), 'vectors')

                # recalculate cluster means by computing the centroid of each cluster
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means
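
    # Convergence illustration (added commentary): with the default
    # conv_test=1e-6 and euclidean distance, if one recomputed mean moves
    # from [1.0, 1.0] to [1.0000001, 1.0] and the others are unchanged, the
    # summed movement is about 1e-7 < 1e-6, so the loop terminates.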

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean):
        if self._avoid_empty_clusters:
            # treat the current mean as one extra pseudo-observation, so an
            # empty cluster simply keeps its previous centroid
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid += vector
            return centroid / (1 + len(cluster))
        else:
            if not len(cluster):
                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
                sys.stderr.write("Try setting argument 'avoid_empty_clusters' to True\n")
                assert False
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid += vector
            return centroid / len(cluster)
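
    # Worked example of the avoid_empty_clusters centroid (added commentary):
    # with previous mean [2, 2] and a single member [4, 4], the new centroid
    # is ([2, 2] + [4, 4]) / 2 = [3, 3]; with no members it stays at [2, 2].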

    def __repr__(self):
        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)


#################################################################################


def demo():
    # example from figure 14.9, page 517, Manning and Schütze
    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)
    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()
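
    # A further sketch, added for illustration: the same data clustered with
    # cosine distance, which nltk.cluster also exports; avoid_empty_clusters
    # guards against trials where one random mean captures every vector.
    from nltk.cluster import cosine_distance

    clusterer = KMeansClusterer(
        2, cosine_distance, repeats=10, avoid_empty_clusters=True
    )
    print("Cosine clusters:", clusterer.cluster(vectors, True))
    print()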


if __name__ == "__main__":
    demo()