pros_cons.py

# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        http://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative
  Sentences". Proceedings of the 22nd International Conference on
  Computational Linguistics (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
  Comparing Opinions on the Web". Proceedings of the 14th International World
  Wide Web conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""
import re

from nltk.corpus.reader.api import *
from nltk.tokenize import *


class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons')
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
  44. """
  45. :param root: The root directory for the corpus.
  46. :param fileids: a list or regexp specifying the fileids in the corpus.
  47. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
  48. into words. Default: `WhitespaceTokenizer`
  49. :param encoding: the encoding that should be used to read the corpus.
  50. :param kwargs: additional parameters passed to CategorizedCorpusReader.
  51. """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the
        specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
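
    # Block readers used by StreamBackedCorpusView.  The view calls a block
    # reader repeatedly; each call consumes up to 20 lines from the stream and
    # returns the sentences (or words) parsed from the tagged lines among them.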
    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                break  # End of file reached: stop instead of polling empty reads.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids
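

# ---------------------------------------------------------------------------
# Usage sketch (not part of the reader): how this class might be pointed at a
# local copy of the corpus.  The root path is a hypothetical placeholder; the
# fileids and cat_pattern assume the integrated files ("IntegratedPros.txt",
# "IntegratedCons.txt") named in the doctests above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    reader = ProsConsCorpusReader(
        root="/path/to/pros_cons",  # hypothetical corpus root
        fileids=r"Integrated(Cons|Pros)\.txt",
        cat_pattern=r"Integrated(Cons|Pros)\.txt",  # fileid -> category via group 1
    )
    print(reader.categories())  # expected: ['Cons', 'Pros']
    print(reader.sents(categories="Pros")[:2])  # first two tokenized pro sentences
    print(reader.words("IntegratedPros.txt")[:10])  # first ten tokens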