reviews.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. # Natural Language Toolkit: Product Reviews Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
  9. - Customer Review Corpus information -
  10. Annotated by: Minqing Hu and Bing Liu, 2004.
  11. Department of Computer Science
  12. University of Illinois at Chicago
  13. Contact: Bing Liu, liub@cs.uic.edu
  14. http://www.cs.uic.edu/~liub
  15. Distributed with permission.
  16. The "product_reviews_1" and "product_reviews_2" datasets respectively contain
  17. annotated customer reviews of 5 and 9 products from amazon.com.
  18. Related papers:
  19. - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
  20. Proceedings of the ACM SIGKDD International Conference on Knowledge
  21. Discovery & Data Mining (KDD-04), 2004.
  22. - Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
  23. Proceedings of the Nineteenth National Conference on Artificial Intelligence
  24. (AAAI-2004), 2004.
  25. - Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
  26. Opinion Mining." Proceedings of First ACM International Conference on Web
  27. Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
  28. Stanford, California, USA.
  29. Symbols used in the annotated reviews:
  30. [t] : the title of the review: Each [t] tag starts a review.
  31. xxxx[+|-n]: xxxx is a product feature.
  32. [+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
  33. Note that the strength is quite subjective.
  34. You may want to ignore it, and only consider + and -
  35. [-n]: Negative opinion
  36. ## : start of each sentence. Each line is a sentence.
  37. [u] : feature did not appear in the sentence.
  38. [p] : feature did not appear in the sentence. Pronoun resolution is needed.
  39. [s] : suggestion or recommendation.
  40. [cc]: comparison with a competing product from a different brand.
  41. [cs]: comparison with a competing product from the same brand.
  42. Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
  43. provide separation between different reviews. This is due to the fact that
  44. the dataset was specifically designed for aspect/feature-based sentiment
  45. analysis, for which sentence-level annotation is sufficient. For document-
  46. level classification and analysis, this peculiarity should be taken into
  47. consideration.
  48. """
  49. import re
  50. from nltk.corpus.reader.api import *
  51. from nltk.tokenize import *
# Regular expressions matching the annotation markup of the Customer Review
# Corpus (see the module docstring above for the full symbol legend).

# A "[t] ..." line starts a new review; group 1 is the review title.
TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
# Captures an annotated product feature and its opinion strength, e.g.
# "picture quality[+2]" -> ('picture quality', '+2').
FEATURES = re.compile(
    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
)  # find 'feature' in feature[+3]
# Captures the bracketed annotation notes u/p/s/cc/cs; the (?!t) lookahead
# prevents the review-title marker [t] from matching.
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
# "##<sentence>" -> group 1 is the (already tokenized) sentence text.
SENT = re.compile(r"##(.*)$")  # find tokenized sentence
  58. class Review(object):
  59. """
  60. A Review is the main block of a ReviewsCorpusReader.
  61. """
  62. def __init__(self, title=None, review_lines=None):
  63. """
  64. :param title: the title of the review.
  65. :param review_lines: the list of the ReviewLines that belong to the Review.
  66. """
  67. self.title = title
  68. if review_lines is None:
  69. self.review_lines = []
  70. else:
  71. self.review_lines = review_lines
  72. def add_line(self, review_line):
  73. """
  74. Add a line (ReviewLine) to the review.
  75. :param review_line: a ReviewLine instance that belongs to the Review.
  76. """
  77. assert isinstance(review_line, ReviewLine)
  78. self.review_lines.append(review_line)
  79. def features(self):
  80. """
  81. Return a list of features in the review. Each feature is a tuple made of
  82. the specific item feature and the opinion strength about that feature.
  83. :return: all features of the review as a list of tuples (feat, score).
  84. :rtype: list(tuple)
  85. """
  86. features = []
  87. for review_line in self.review_lines:
  88. features.extend(review_line.features)
  89. return features
  90. def sents(self):
  91. """
  92. Return all tokenized sentences in the review.
  93. :return: all sentences of the review as lists of tokens.
  94. :rtype: list(list(str))
  95. """
  96. return [review_line.sent for review_line in self.review_lines]
  97. def __repr__(self):
  98. return 'Review(title="{}", review_lines={})'.format(
  99. self.title, self.review_lines
  100. )
  101. class ReviewLine(object):
  102. """
  103. A ReviewLine represents a sentence of the review, together with (optional)
  104. annotations of its features and notes about the reviewed item.
  105. """
  106. def __init__(self, sent, features=None, notes=None):
  107. self.sent = sent
  108. if features is None:
  109. self.features = []
  110. else:
  111. self.features = features
  112. if notes is None:
  113. self.notes = []
  114. else:
  115. self.notes = notes
  116. def __repr__(self):
  117. return "ReviewLine(features={}, notes={}, sent={})".format(
  118. self.features, self.notes, self.sent
  119. )
  120. class ReviewsCorpusReader(CorpusReader):
  121. """
  122. Reader for the Customer Review Data dataset by Hu, Liu (2004).
  123. Note: we are not applying any sentence tokenization at the moment, just word
  124. tokenization.
  125. >>> from nltk.corpus import product_reviews_1
  126. >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
  127. >>> review = camera_reviews[0]
  128. >>> review.sents()[0]
  129. ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
  130. 'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
  131. >>> review.features()
  132. [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
  133. ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
  134. ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
  135. ('option', '+1')]
  136. We can also reach the same information directly from the stream:
  137. >>> product_reviews_1.features('Canon_G3.txt')
  138. [('canon powershot g3', '+3'), ('use', '+2'), ...]
  139. We can compute stats for specific product features:
  140. >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
  141. >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
  142. >>> mean = tot / n_reviews
  143. >>> print(n_reviews, tot, mean)
  144. 15 24 1.6
  145. """
  146. CorpusView = StreamBackedCorpusView
  147. def __init__(
  148. self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
  149. ):
  150. """
  151. :param root: The root directory for the corpus.
  152. :param fileids: a list or regexp specifying the fileids in the corpus.
  153. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
  154. into words. Default: `WordPunctTokenizer`
  155. :param encoding: the encoding that should be used to read the corpus.
  156. """
  157. CorpusReader.__init__(self, root, fileids, encoding)
  158. self._word_tokenizer = word_tokenizer
  159. def features(self, fileids=None):
  160. """
  161. Return a list of features. Each feature is a tuple made of the specific
  162. item feature and the opinion strength about that feature.
  163. :param fileids: a list or regexp specifying the ids of the files whose
  164. features have to be returned.
  165. :return: all features for the item(s) in the given file(s).
  166. :rtype: list(tuple)
  167. """
  168. if fileids is None:
  169. fileids = self._fileids
  170. elif isinstance(fileids, str):
  171. fileids = [fileids]
  172. return concat(
  173. [
  174. self.CorpusView(fileid, self._read_features, encoding=enc)
  175. for (fileid, enc) in self.abspaths(fileids, True)
  176. ]
  177. )
  178. def raw(self, fileids=None):
  179. """
  180. :param fileids: a list or regexp specifying the fileids of the files that
  181. have to be returned as a raw string.
  182. :return: the given file(s) as a single string.
  183. :rtype: str
  184. """
  185. if fileids is None:
  186. fileids = self._fileids
  187. elif isinstance(fileids, str):
  188. fileids = [fileids]
  189. return concat([self.open(f).read() for f in fileids])
  190. def readme(self):
  191. """
  192. Return the contents of the corpus README.txt file.
  193. """
  194. return self.open("README.txt").read()
  195. def reviews(self, fileids=None):
  196. """
  197. Return all the reviews as a list of Review objects. If `fileids` is
  198. specified, return all the reviews from each of the specified files.
  199. :param fileids: a list or regexp specifying the ids of the files whose
  200. reviews have to be returned.
  201. :return: the given file(s) as a list of reviews.
  202. """
  203. if fileids is None:
  204. fileids = self._fileids
  205. return concat(
  206. [
  207. self.CorpusView(fileid, self._read_review_block, encoding=enc)
  208. for (fileid, enc) in self.abspaths(fileids, True)
  209. ]
  210. )
  211. def sents(self, fileids=None):
  212. """
  213. Return all sentences in the corpus or in the specified files.
  214. :param fileids: a list or regexp specifying the ids of the files whose
  215. sentences have to be returned.
  216. :return: the given file(s) as a list of sentences, each encoded as a
  217. list of word strings.
  218. :rtype: list(list(str))
  219. """
  220. return concat(
  221. [
  222. self.CorpusView(path, self._read_sent_block, encoding=enc)
  223. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  224. ]
  225. )
  226. def words(self, fileids=None):
  227. """
  228. Return all words and punctuation symbols in the corpus or in the specified
  229. files.
  230. :param fileids: a list or regexp specifying the ids of the files whose
  231. words have to be returned.
  232. :return: the given file(s) as a list of words and punctuation symbols.
  233. :rtype: list(str)
  234. """
  235. return concat(
  236. [
  237. self.CorpusView(path, self._read_word_block, encoding=enc)
  238. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  239. ]
  240. )
  241. def _read_features(self, stream):
  242. features = []
  243. for i in range(20):
  244. line = stream.readline()
  245. if not line:
  246. return features
  247. features.extend(re.findall(FEATURES, line))
  248. return features
  249. def _read_review_block(self, stream):
  250. while True:
  251. line = stream.readline()
  252. if not line:
  253. return [] # end of file.
  254. title_match = re.match(TITLE, line)
  255. if title_match:
  256. review = Review(
  257. title=title_match.group(1).strip()
  258. ) # We create a new review
  259. break
  260. # Scan until we find another line matching the regexp, or EOF.
  261. while True:
  262. oldpos = stream.tell()
  263. line = stream.readline()
  264. # End of file:
  265. if not line:
  266. return [review]
  267. # Start of a new review: backup to just before it starts, and
  268. # return the review we've already collected.
  269. if re.match(TITLE, line):
  270. stream.seek(oldpos)
  271. return [review]
  272. # Anything else is part of the review line.
  273. feats = re.findall(FEATURES, line)
  274. notes = re.findall(NOTES, line)
  275. sent = re.findall(SENT, line)
  276. if sent:
  277. sent = self._word_tokenizer.tokenize(sent[0])
  278. review_line = ReviewLine(sent=sent, features=feats, notes=notes)
  279. review.add_line(review_line)
  280. def _read_sent_block(self, stream):
  281. sents = []
  282. for review in self._read_review_block(stream):
  283. sents.extend([sent for sent in review.sents()])
  284. return sents
  285. def _read_word_block(self, stream):
  286. words = []
  287. for i in range(20): # Read 20 lines at a time.
  288. line = stream.readline()
  289. sent = re.findall(SENT, line)
  290. if sent:
  291. words.extend(self._word_tokenizer.tokenize(sent[0]))
  292. return words