# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                    P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
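# A worked instance of the normalization step above (a sketch with
# invented numbers, not drawn from any corpus): suppose there are two
# labels A and B, a single feature f1, and the numerators come out as
#
#     P(A) * P(f1|A) = 0.5 * 0.2 = 0.10
#     P(B) * P(f1|B) = 0.5 * 0.6 = 0.30
#
# Normalizing gives P(A|features) = 0.10 / (0.10 + 0.30) = 0.25 and
# P(B|features) = 0.30 / (0.10 + 0.30) = 0.75.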
from collections import defaultdict

from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

##//////////////////////////////////////////////////////
##  Naive Bayes Classifier
##//////////////////////////////////////////////////////

class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

    - P(label) gives the probability that an input will receive each
      label, given no information about the input's features.

    - P(fname=fval|label) gives the probability that a given feature
      (fname) will receive a given value (fval), given the label
      (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """
    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label, fname].prob(fval)``.  If a given
            ``(label, fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
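        # For example, the two distributions could be built by hand with
        # NLTK's DictionaryProbDist (a hypothetical sketch; the 'spam'/'ham'
        # labels and the 'contains(cash)' feature are invented):
        #
        #     label_probdist = DictionaryProbDist({'spam': 0.6, 'ham': 0.4})
        #     feature_probdist = {
        #         ('spam', 'contains(cash)'): DictionaryProbDist({True: 0.9, False: 0.1}),
        #         ('ham', 'contains(cash)'): DictionaryProbDist({True: 0.2, False: 0.8}),
        #     }
        #     classifier = NaiveBayesClassifier(label_probdist, feature_probdist)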
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = list(label_probdist.samples())

    def labels(self):
        return self._labels
    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
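    # prob_classify() returns a DictionaryProbDist over labels, so the full
    # posterior can be inspected rather than just the single best label.
    # For example (a sketch assuming ``classifier`` was already trained on
    # featuresets that include a 'contains(cash)' feature):
    #
    #     dist = classifier.prob_classify({'contains(cash)': True})
    #     for label in dist.samples():
    #         print('%s: %.4f' % (label, dist.prob(label)))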
    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                [l for l in self._labels if fval in cpdist[l, fname].samples()],
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )
    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
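        # For instance (invented numbers): if P(last_letter='a'|'female')
        # is 0.30 and P(last_letter='a'|'male') is 0.01, then the feature
        # ('last_letter', 'a') has informativeness 0.30 / 0.01 = 30.0, so
        # it sorts ahead of features whose ratio is closer to 1.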
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)
            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]
    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
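        # Typical usage (a minimal sketch; the feature sets and labels
        # below are invented):
        #
        #     train_set = [
        #         ({'outlook': 'sunny', 'windy': False}, 'play'),
        #         ({'outlook': 'rainy', 'windy': True}, 'stay'),
        #     ]
        #     classifier = NaiveBayesClassifier.train(train_set)
        #     classifier.classify({'outlook': 'sunny', 'windy': True})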
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)
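        # For example (invented counts): if the label 'play' occurs in 10
        # training instances but only 7 of them supply a 'windy' feature,
        # then freq('windy'=None | 'play') is incremented by 10 - 7 = 3.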
        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////
def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()