# coding: utf-8
#
# Natural Language Toolkit: Sentiment Analyzer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
using NLTK features and classifiers, especially for teaching and demonstrative
purposes.
"""
import pickle
import sys
from collections import defaultdict

from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import (
    BigramAssocMeasures,
    precision as eval_precision,
    recall as eval_recall,
    f_measure as eval_f_measure,
)
from nltk.probability import FreqDist


class SentimentAnalyzer(object):
    """
    A Sentiment Analysis tool based on machine learning approaches.
    """

    def __init__(self, classifier=None):
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier

    def all_words(self, documents, labeled=None):
        """
        Return all words/tokens from the documents (with duplicates).

        :param documents: a list of (words, label) tuples.
        :param labeled: if `True`, assume that each document is represented by a
            (words, label) tuple: (list(str), str). If `False`, each document is
            considered as being a simple list of strings: list(str). If `None`,
            the format is guessed from the first document.
        :rtype: list(str)
        :return: A list of all words/tokens in `documents`.
        """
        all_words = []
        if labeled is None:
            labeled = documents and isinstance(documents[0], tuple)
        if labeled:
            for words, sentiment in documents:
                all_words.extend(words)
        else:
            for words in documents:
                all_words.extend(words)
        return all_words

    def apply_features(self, documents, labeled=None):
        """
        Apply all feature extractor functions to the documents. This is a wrapper
        around `nltk.classify.util.apply_features`.

        If `labeled=False`, return featuresets as:
            [feature_func(doc) for doc in documents]
        If `labeled=True`, return featuresets as:
            [(feature_func(tok), label) for (tok, label) in toks]

        :param documents: a list of documents. If `labeled=True`, the method expects
            a list of (words, label) tuples.
        :rtype: LazyMap
        """
        return apply_features(self.extract_features, documents, labeled)
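
    # Illustrative usage (a sketch; `analyzer` is a SentimentAnalyzer with at
    # least one feature extractor registered via add_feat_extractor):
    #
    #   labeled_docs = [(["a", "great", "movie"], "pos")]
    #   analyzer.apply_features(labeled_docs, labeled=True)
    #   # -> lazy sequence of (featureset, "pos") pairs
    #   analyzer.apply_features([["a", "great", "movie"]], labeled=False)
    #   # -> lazy sequence of bare featuresets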

    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """
        Return most common top_n word features.

        :param words: a list of words/tokens.
        :param top_n: number of best words/tokens to use, sorted by frequency.
        :param min_freq: the minimum frequency a word/token must exceed to be kept.
        :rtype: list(str)
        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
            frequency.
        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(word for word in words)
        return [
            w
            for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq
        ]
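
    # Illustrative sketch: unigram_word_feats(["good", "good", "bad"], top_n=2,
    # min_freq=1) returns ["good"], because "bad" occurs only once and the
    # frequency filter is strict (a word is kept only if its count exceeds
    # min_freq).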

    def bigram_collocation_feats(
        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
    ):
        """
        Return `top_n` bigram features (using `assoc_measure`).

        Note that this method is based on bigram collocation measures, and not
        on simple bigram frequency.

        :param documents: a list (or iterable) of tokenized documents.
        :param top_n: number of best bigrams to use, sorted by association
            measure.
        :param assoc_measure: bigram association measure to use as score function.
        :param min_freq: the minimum number of occurrences of a bigram to take
            into consideration.
        :return: `top_n` ngrams scored by the given association measure.
        """
        finder = BigramCollocationFinder.from_documents(documents)
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)
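
    # Illustrative sketch: `documents` is an iterable of token lists, e.g.
    # [["new", "york", "city"], ["new", "york", "times"], ...]; the result is a
    # list of (w1, w2) tuples ranked by the chosen association measure
    # (PMI by default), keeping only bigrams seen at least `min_freq` times.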

    def classify(self, instance):
        """
        Classify a single instance applying the features that have already been
        stored in the SentimentAnalyzer.

        :param instance: a list (or iterable) of tokens.
        :return: the classification result given by applying the classifier.
        """
        instance_feats = self.apply_features([instance], labeled=False)
        return self.classifier.classify(instance_feats[0])

    def add_feat_extractor(self, function, **kwargs):
        """
        Add a new function to extract features from a document. This function will
        be used in extract_features().

        Important: in this step `kwargs` only represents the additional parameters,
        and NOT the document to parse. The document will always be the first
        parameter passed to `function`, and it is supplied by extract_features().

        :param function: the extractor function to add to the list of feature extractors.
        :param kwargs: additional parameters required by `function`.
        """
        self.feat_extractors[function].append(kwargs)
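
    # Illustrative sketch, using a hypothetical extractor `contains_words`:
    #
    #   def contains_words(document, words=None):
    #       return {"contains({0})".format(w): (w in document) for w in words}
    #
    #   analyzer.add_feat_extractor(contains_words, words=["great", "boring"])
    #   analyzer.extract_features(["a", "great", "movie"])
    #   # -> {"contains(great)": True, "contains(boring)": False}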

    def extract_features(self, document):
        """
        Apply extractor functions (and their parameters) to the present document.
        We pass `document` as the first parameter of the extractor functions.
        If we want to use the same extractor function multiple times, we have to
        add it to the extractors with `add_feat_extractor` using multiple sets of
        parameters (one for each call of the extractor function).

        :param document: the document that will be passed as argument to the
            feature extractor functions.
        :return: A dictionary of populated features extracted from the document.
        :rtype: dict
        """
        all_features = {}
        for extractor in self.feat_extractors:
            for param_set in self.feat_extractors[extractor]:
                feats = extractor(document, **param_set)
                all_features.update(feats)
        return all_features

    def train(self, trainer, training_set, save_classifier=None, **kwargs):
        """
        Train classifier on the training set, optionally saving the output in the
        file specified by `save_classifier`.

        Additional arguments depend on the specific trainer used. For example,
        a MaxentClassifier can use a `max_iter` parameter to specify the number
        of iterations, while a NaiveBayesClassifier cannot.

        :param trainer: `train` method of a classifier.
            E.g.: NaiveBayesClassifier.train
        :param training_set: the training set to be passed as argument to the
            classifier `train` method.
        :param save_classifier: the filename of the file where the classifier
            will be stored (optional).
        :param kwargs: additional parameters that will be passed as arguments to
            the classifier `train` function.
        :return: A classifier instance trained on the training set.
        """
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            self.save_file(self.classifier, save_classifier)
        return self.classifier
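
    # Illustrative sketch: `trainer` is the (unbound) train method of a
    # classifier class, e.g.
    #
    #   from nltk.classify import NaiveBayesClassifier
    #   classifier = analyzer.train(NaiveBayesClassifier.train, training_set)
    #
    # Extra keyword arguments (e.g. max_iter for a MaxentClassifier trainer)
    # are forwarded unchanged to `trainer`.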

    def save_file(self, content, filename):
        """
        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
        """
        print("Saving", filename, file=sys.stderr)
        with open(filename, "wb") as storage_file:
            # The protocol=2 parameter is for python2 compatibility
            pickle.dump(content, storage_file, protocol=2)

    def evaluate(
        self,
        test_set,
        classifier=None,
        accuracy=True,
        f_measure=True,
        precision=True,
        recall=True,
        verbose=False,
    ):
        """
        Evaluate and print classifier performance on the test set.

        :param test_set: A list of (featureset, label) tuples to use as gold set,
            e.g. the output of apply_features(..., labeled=True).
        :param classifier: a classifier instance (previously trained).
        :param accuracy: if `True`, evaluate classifier accuracy.
        :param f_measure: if `True`, evaluate classifier f_measure.
        :param precision: if `True`, evaluate classifier precision.
        :param recall: if `True`, evaluate classifier recall.
        :param verbose: if `True`, print the computed metrics.
        :return: evaluation results.
        :rtype: dict(str): float
        """
        if classifier is None:
            classifier = self.classifier
        print("Evaluating {0} results...".format(type(classifier).__name__))
        metrics_results = {}
        if accuracy:
            accuracy_score = eval_accuracy(classifier, test_set)
            metrics_results["Accuracy"] = accuracy_score

        gold_results = defaultdict(set)
        test_results = defaultdict(set)
        labels = set()
        for i, (feats, label) in enumerate(test_set):
            labels.add(label)
            gold_results[label].add(i)
            observed = classifier.classify(feats)
            test_results[observed].add(i)

        for label in labels:
            if precision:
                precision_score = eval_precision(
                    gold_results[label], test_results[label]
                )
                metrics_results["Precision [{0}]".format(label)] = precision_score
            if recall:
                recall_score = eval_recall(gold_results[label], test_results[label])
                metrics_results["Recall [{0}]".format(label)] = recall_score
            if f_measure:
                f_measure_score = eval_f_measure(
                    gold_results[label], test_results[label]
                )
                metrics_results["F-measure [{0}]".format(label)] = f_measure_score

        # Print evaluation results (in alphabetical order)
        if verbose:
            for result in sorted(metrics_results):
                print("{0}: {1}".format(result, metrics_results[result]))

        return metrics_results
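

# A minimal end-to-end sketch of the intended workflow, using toy data. It
# assumes `extract_unigram_feats` from nltk.sentiment.util and
# NaiveBayesClassifier from nltk.classify; any feature extractor or trainer
# with the same calling convention would work.
if __name__ == "__main__":
    from nltk.classify import NaiveBayesClassifier
    from nltk.sentiment.util import extract_unigram_feats

    training_docs = [
        (["a", "truly", "great", "movie"], "pos"),
        (["what", "a", "great", "cast"], "pos"),
        (["a", "dull", "and", "boring", "movie"], "neg"),
        (["boring", "plot", "and", "bad", "acting"], "neg"),
    ]
    test_docs = [
        (["great", "acting"], "pos"),
        (["boring", "movie"], "neg"),
    ]

    analyzer = SentimentAnalyzer()

    # Collect the training vocabulary and keep the most frequent unigrams
    # as features.
    vocabulary = analyzer.all_words(training_docs, labeled=True)
    unigram_feats = analyzer.unigram_word_feats(vocabulary, top_n=20)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Turn token lists into featuresets (lazily).
    training_set = analyzer.apply_features(training_docs, labeled=True)
    test_set = analyzer.apply_features(test_docs, labeled=True)

    # Train a Naive Bayes classifier, classify a new instance and evaluate.
    classifier = analyzer.train(NaiveBayesClassifier.train, training_set)
    print(analyzer.classify(["a", "great", "movie"]))
    analyzer.evaluate(test_set, classifier, verbose=True)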