opinion_lexicon.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # Natural Language Toolkit: Opinion Lexicon Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. CorpusReader for the Opinion Lexicon.
  9. - Opinion Lexicon information -
  10. Authors: Minqing Hu and Bing Liu, 2004.
  11. Department of Computer Science
  12. University of Illinois at Chicago
  13. Contact: Bing Liu, liub@cs.uic.edu
  14. http://www.cs.uic.edu/~liub
  15. Distributed with permission.
  16. Related papers:
  17. - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
  18. Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
  19. & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
  20. - Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
  21. Comparing Opinions on the Web". Proceedings of the 14th International World
  22. Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
  23. """
  24. from nltk.corpus.reader import WordListCorpusReader
  25. from nltk.corpus.reader.api import *
  26. class IgnoreReadmeCorpusView(StreamBackedCorpusView):
  27. """
  28. This CorpusView is used to skip the initial readme block of the corpus.
  29. """
  30. def __init__(self, *args, **kwargs):
  31. StreamBackedCorpusView.__init__(self, *args, **kwargs)
  32. # open self._stream
  33. self._open()
  34. # skip the readme block
  35. read_blankline_block(self._stream)
  36. # Set the initial position to the current stream position
  37. self._filepos = [self._stream.tell()]
  38. class OpinionLexiconCorpusReader(WordListCorpusReader):
  39. """
  40. Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored.
  41. >>> from nltk.corpus import opinion_lexicon
  42. >>> opinion_lexicon.words()
  43. ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
  44. The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
  45. words:
  46. >>> opinion_lexicon.negative()
  47. ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
  48. Note that words from `words()` method are sorted by file id, not alphabetically:
  49. >>> opinion_lexicon.words()[0:10]
  50. ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
  51. 'abominate', 'abomination', 'abort', 'aborted']
  52. >>> sorted(opinion_lexicon.words())[0:10]
  53. ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
  54. 'abominate', 'abomination', 'abort']
  55. """
  56. CorpusView = IgnoreReadmeCorpusView
  57. def words(self, fileids=None):
  58. """
  59. Return all words in the opinion lexicon. Note that these words are not
  60. sorted in alphabetical order.
  61. :param fileids: a list or regexp specifying the ids of the files whose
  62. words have to be returned.
  63. :return: the given file(s) as a list of words and punctuation symbols.
  64. :rtype: list(str)
  65. """
  66. if fileids is None:
  67. fileids = self._fileids
  68. elif isinstance(fileids, str):
  69. fileids = [fileids]
  70. return concat(
  71. [
  72. self.CorpusView(path, self._read_word_block, encoding=enc)
  73. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  74. ]
  75. )
  76. def positive(self):
  77. """
  78. Return all positive words in alphabetical order.
  79. :return: a list of positive words.
  80. :rtype: list(str)
  81. """
  82. return self.words("positive-words.txt")
  83. def negative(self):
  84. """
  85. Return all negative words in alphabetical order.
  86. :return: a list of negative words.
  87. :rtype: list(str)
  88. """
  89. return self.words("negative-words.txt")
  90. def _read_word_block(self, stream):
  91. words = []
  92. for i in range(20): # Read 20 lines at a time.
  93. line = stream.readline()
  94. if not line:
  95. continue
  96. words.append(line.strip())
  97. return words