# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions and classes for classifiers.
"""
import math

# from nltk.util import Deprecated
import nltk.classify.util  # for accuracy & log_likelihood
from nltk.util import LazyMap

######################################################################
# { Helper Functions
######################################################################

# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok, label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    """
    if labeled is None:
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:

        def lazy_func(labeled_token):
            return (feature_func(labeled_token[0]), labeled_token[1])

        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)
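
# Usage sketch (the lambda is a stand-in feature function, not part of this
# module); the featureset dict is only built when an element is accessed:
#
#     >>> toks = [("Alice", "female"), ("Bob", "male")]
#     >>> featuresets = apply_features(lambda w: {"last": w[-1]}, toks)
#     >>> featuresets[0]
#     ({'last': 'e'}, 'female')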


def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    return tuple(set(label for (tok, label) in tokens))
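
# For example (sorted here because the tuple is built from a set, so its
# element order is not guaranteed):
#
#     >>> sorted(attested_labels([("Alice", "female"), ("Bob", "male")]))
#     ['female', 'male']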


def log_likelihood(classifier, gold):
    # Take the log of the classifier's average probability for the
    # correct label, over all (featureset, label) pairs in ``gold``.
    results = classifier.prob_classify_many([fs for (fs, l) in gold])
    ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
    return math.log(sum(ll) / len(ll))


def accuracy(classifier, gold):
    # Fraction of (featureset, label) pairs in ``gold`` for which the
    # classifier's prediction matches the gold label.
    results = classifier.classify_many([fs for (fs, l) in gold])
    correct = [l == r for ((fs, l), r) in zip(gold, results)]
    if correct:
        return sum(correct) / len(correct)
    else:
        return 0
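
# Sketch of the expected ``gold`` format, using a toy classifier (not part
# of NLTK) that implements the same classify_many() interface:
#
#     >>> class AlwaysMale:
#     ...     def classify_many(self, featuresets):
#     ...         return ["male"] * len(featuresets)
#     >>> gold = [({"f": 1}, "male"), ({"f": 2}, "female")]
#     >>> accuracy(AlwaysMale(), gold)
#     0.5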


class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        self.cutoffs = cutoffs.copy()
        # Normalize the signs on our copy (not on the caller's dict):
        # min_ll is a log probability, so it must be non-positive.
        if "min_ll" in self.cutoffs:
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None
        self.acc = None
        self.iter = 1

    def check(self, classifier, train_toks):
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.
        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True
        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll
        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc
        return False  # no cutoff reached.
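
# Typical driver loop (sketch): an iterative trainer builds a checker from
# its cutoff keywords and keeps improving the model until check() returns
# True.  ``classifier`` and ``train_toks`` are assumed to exist in the
# surrounding trainer code:
#
#     >>> cutoffchecker = CutoffChecker({"max_iter": 100, "min_lldelta": 1e-7})
#     >>> while not cutoffchecker.check(classifier, train_toks):
#     ...     pass  # perform one more improvement step on the classifier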


######################################################################
# { Demos
######################################################################


def names_demo_features(name):
    features = {}
    features["alwayson"] = True
    features["startswith"] = name[0].lower()
    features["endswith"] = name[-1].lower()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = letter in name.lower()
    return features
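
# For example, names_demo_features("Ada") contains, among its 55 keys:
#
#     {'alwayson': True, 'startswith': 'a', 'endswith': 'a',
#      'count(a)': 2, 'has(a)': True, 'count(b)': 0, 'has(b)': False, ...}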


def binary_names_demo_features(name):
    features = {}
    features["alwayson"] = True
    features["startswith(vowel)"] = name[0].lower() in "aeiouy"
    features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = letter in name.lower()
        features["startswith(%s)" % letter] = letter == name[0].lower()
        features["endswith(%s)" % letter] = letter == name[-1].lower()
    return features


def names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == "male":
                fmt = "  %-15s *%6.4f   %6.4f"
            else:
                fmt = "  %-15s  %6.4f  *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
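
# Invocation sketch: any trainer that accepts a single list of
# (featureset, label) pairs works here, e.g. NaiveBayesClassifier.train:
#
#     >>> from nltk.classify import NaiveBayesClassifier
#     >>> classifier = names_demo(NaiveBayesClassifier.train)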


def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]
    random.shuffle(test)

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = "  %-15s *%6.4f   %6.4f"
            else:
                fmt = "  %-15s  %6.4f  *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
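
# Invocation sketch: this demo expects a trainer that takes separate
# positive and unlabeled featureset iterables, e.g.
# PositiveNaiveBayesClassifier.train:
#
#     >>> from nltk.classify import PositiveNaiveBayesClassifier
#     >>> classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)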


_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list(set(l for (i, l) in instances))
    print("  Senses: " + " ".join(senses))

    # Randomly split the instances into a test & train set.
    print("Splitting into test & train...")
    random.seed(123456)
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(i) for (i, l) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((i, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
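
# Invocation sketch ("interest.pos" is one of the senseval corpus files;
# ``left_word`` is a hypothetical feature function that only looks at the
# word immediately to the left of the target):
#
#     >>> from nltk.classify import NaiveBayesClassifier
#     >>> def left_word(inst):
#     ...     p = inst.position
#     ...     return {"left": inst.context[p - 1][0] if p > 0 else "<s>"}
#     >>> classifier = wsd_demo(NaiveBayesClassifier.train, "interest.pos", left_word)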


def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.
    """
    try:
        _megam_bin
    except NameError:
        err_msg = str(
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/local/bin/megam')"
        )
        raise NameError(err_msg)
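
# Configuration sketch (the binary path is just an example location;
# substitute wherever megam is actually installed):
#
#     >>> import nltk
#     >>> nltk.config_megam("/usr/local/bin/megam")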