# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.

- Comparative Sentence Dataset information -

Annotated by: Nitin Jindal and Bing Liu, 2006.
              Department of Computer Science
              University of Illinois at Chicago

Contact: Nitin Jindal, njindal@cs.uic.edu
         Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)

Distributed with permission.

Related papers:

- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
  Proceedings of the ACM SIGIR International Conference on Information Retrieval
  (SIGIR-06), 2006.

- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
  Proceedings of Twenty First National Conference on Artificial Intelligence
  (AAAI-2006), 2006.

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
  Proceedings of the 22nd International Conference on Computational Linguistics
  (Coling-2008), Manchester, 18-22 August, 2008.
"""
import re

from nltk.corpus.reader.api import *
from nltk.tokenize import *
  30. # Regular expressions for dataset components
  31. STARS = re.compile(r"^\*+$")
  32. COMPARISON = re.compile(r"<cs-[1234]>")
  33. CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
  34. GRAD_COMPARISON = re.compile(r"<cs-[123]>")
  35. NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
  36. ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
  37. KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
  38. class Comparison(object):
  39. """
  40. A Comparison represents a comparative sentence and its constituents.
  41. """
  42. def __init__(
  43. self,
  44. text=None,
  45. comp_type=None,
  46. entity_1=None,
  47. entity_2=None,
  48. feature=None,
  49. keyword=None,
  50. ):
  51. """
  52. :param text: a string (optionally tokenized) containing a comparation.
  53. :param comp_type: an integer defining the type of comparison expressed.
  54. Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
  55. 4 (Non-gradable).
  56. :param entity_1: the first entity considered in the comparison relation.
  57. :param entity_2: the second entity considered in the comparison relation.
  58. :param feature: the feature considered in the comparison relation.
  59. :param keyword: the word or phrase which is used for that comparative relation.
  60. """
  61. self.text = text
  62. self.comp_type = comp_type
  63. self.entity_1 = entity_1
  64. self.entity_2 = entity_2
  65. self.feature = feature
  66. self.keyword = keyword
  67. def __repr__(self):
  68. return (
  69. 'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
  70. 'feature="{}", keyword="{}")'
  71. ).format(
  72. self.text,
  73. self.comp_type,
  74. self.entity_1,
  75. self.entity_2,
  76. self.feature,
  77. self.keyword,
  78. )
  79. class ComparativeSentencesCorpusReader(CorpusReader):
  80. """
  81. Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
  82. >>> from nltk.corpus import comparative_sentences
  83. >>> comparison = comparative_sentences.comparisons()[0]
  84. >>> comparison.text
  85. ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
  86. 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
  87. 'had', '.']
  88. >>> comparison.entity_2
  89. 'models'
  90. >>> (comparison.feature, comparison.keyword)
  91. ('rewind', 'more')
  92. >>> len(comparative_sentences.comparisons())
  93. 853
  94. """
  95. CorpusView = StreamBackedCorpusView
  96. def __init__(
  97. self,
  98. root,
  99. fileids,
  100. word_tokenizer=WhitespaceTokenizer(),
  101. sent_tokenizer=None,
  102. encoding="utf8",
  103. ):
  104. """
  105. :param root: The root directory for this corpus.
  106. :param fileids: a list or regexp specifying the fileids in this corpus.
  107. :param word_tokenizer: tokenizer for breaking sentences or paragraphs
  108. into words. Default: `WhitespaceTokenizer`
  109. :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
  110. :param encoding: the encoding that should be used to read the corpus.
  111. """
  112. CorpusReader.__init__(self, root, fileids, encoding)
  113. self._word_tokenizer = word_tokenizer
  114. self._sent_tokenizer = sent_tokenizer
  115. def comparisons(self, fileids=None):
  116. """
  117. Return all comparisons in the corpus.
  118. :param fileids: a list or regexp specifying the ids of the files whose
  119. comparisons have to be returned.
  120. :return: the given file(s) as a list of Comparison objects.
  121. :rtype: list(Comparison)
  122. """
  123. if fileids is None:
  124. fileids = self._fileids
  125. elif isinstance(fileids, str):
  126. fileids = [fileids]
  127. return concat(
  128. [
  129. self.CorpusView(path, self._read_comparison_block, encoding=enc)
  130. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  131. ]
  132. )
  133. def keywords(self, fileids=None):
  134. """
  135. Return a set of all keywords used in the corpus.
  136. :param fileids: a list or regexp specifying the ids of the files whose
  137. keywords have to be returned.
  138. :return: the set of keywords and comparative phrases used in the corpus.
  139. :rtype: set(str)
  140. """
  141. all_keywords = concat(
  142. [
  143. self.CorpusView(path, self._read_keyword_block, encoding=enc)
  144. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  145. ]
  146. )
  147. keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)
  148. return keywords_set
  149. def keywords_readme(self):
  150. """
  151. Return the list of words and constituents considered as clues of a
  152. comparison (from listOfkeywords.txt).
  153. """
  154. keywords = []
  155. raw_text = self.open("listOfkeywords.txt").read()
  156. for line in raw_text.split("\n"):
  157. if not line or line.startswith("//"):
  158. continue
  159. keywords.append(line.strip())
  160. return keywords
  161. def raw(self, fileids=None):
  162. """
  163. :param fileids: a list or regexp specifying the fileids that have to be
  164. returned as a raw string.
  165. :return: the given file(s) as a single string.
  166. :rtype: str
  167. """
  168. if fileids is None:
  169. fileids = self._fileids
  170. elif isinstance(fileids, str):
  171. fileids = [fileids]
  172. return concat([self.open(f).read() for f in fileids])
  173. def readme(self):
  174. """
  175. Return the contents of the corpus readme file.
  176. """
  177. return self.open("README.txt").read()
  178. def sents(self, fileids=None):
  179. """
  180. Return all sentences in the corpus.
  181. :param fileids: a list or regexp specifying the ids of the files whose
  182. sentences have to be returned.
  183. :return: all sentences of the corpus as lists of tokens (or as plain
  184. strings, if no word tokenizer is specified).
  185. :rtype: list(list(str)) or list(str)
  186. """
  187. return concat(
  188. [
  189. self.CorpusView(path, self._read_sent_block, encoding=enc)
  190. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  191. ]
  192. )
  193. def words(self, fileids=None):
  194. """
  195. Return all words and punctuation symbols in the corpus.
  196. :param fileids: a list or regexp specifying the ids of the files whose
  197. words have to be returned.
  198. :return: the given file(s) as a list of words and punctuation symbols.
  199. :rtype: list(str)
  200. """
  201. return concat(
  202. [
  203. self.CorpusView(path, self._read_word_block, encoding=enc)
  204. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  205. ]
  206. )
  207. def _read_comparison_block(self, stream):
  208. while True:
  209. line = stream.readline()
  210. if not line:
  211. return [] # end of file.
  212. comparison_tags = re.findall(COMPARISON, line)
  213. if comparison_tags:
  214. grad_comparisons = re.findall(GRAD_COMPARISON, line)
  215. non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
  216. # Advance to the next line (it contains the comparative sentence)
  217. comparison_text = stream.readline().strip()
  218. if self._word_tokenizer:
  219. comparison_text = self._word_tokenizer.tokenize(comparison_text)
  220. # Skip the next line (it contains closing comparison tags)
  221. stream.readline()
  222. # If gradable comparisons are found, create Comparison instances
  223. # and populate their fields
  224. comparison_bundle = []
  225. if grad_comparisons:
  226. # Each comparison tag has its own relations on a separate line
  227. for comp in grad_comparisons:
  228. comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
  229. comparison = Comparison(
  230. text=comparison_text, comp_type=comp_type
  231. )
  232. line = stream.readline()
  233. entities_feats = ENTITIES_FEATS.findall(line)
  234. if entities_feats:
  235. for (code, entity_feat) in entities_feats:
  236. if code == "1":
  237. comparison.entity_1 = entity_feat.strip()
  238. elif code == "2":
  239. comparison.entity_2 = entity_feat.strip()
  240. elif code == "3":
  241. comparison.feature = entity_feat.strip()
  242. keyword = KEYWORD.findall(line)
  243. if keyword:
  244. comparison.keyword = keyword[0]
  245. comparison_bundle.append(comparison)
  246. # If non-gradable comparisons are found, create a simple Comparison
  247. # instance for each one
  248. if non_grad_comparisons:
  249. for comp in non_grad_comparisons:
  250. # comp_type in this case should always be 4.
  251. comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
  252. comparison = Comparison(
  253. text=comparison_text, comp_type=comp_type
  254. )
  255. comparison_bundle.append(comparison)
  256. # Flatten the list of comparisons before returning them
  257. # return concat([comparison_bundle])
  258. return comparison_bundle
  259. def _read_keyword_block(self, stream):
  260. keywords = []
  261. for comparison in self._read_comparison_block(stream):
  262. keywords.append(comparison.keyword)
  263. return keywords
  264. def _read_sent_block(self, stream):
  265. while True:
  266. line = stream.readline()
  267. if re.match(STARS, line):
  268. while True:
  269. line = stream.readline()
  270. if re.match(STARS, line):
  271. break
  272. continue
  273. if (
  274. not re.findall(COMPARISON, line)
  275. and not ENTITIES_FEATS.findall(line)
  276. and not re.findall(CLOSE_COMPARISON, line)
  277. ):
  278. if self._sent_tokenizer:
  279. return [
  280. self._word_tokenizer.tokenize(sent)
  281. for sent in self._sent_tokenizer.tokenize(line)
  282. ]
  283. else:
  284. return [self._word_tokenizer.tokenize(line)]
  285. def _read_word_block(self, stream):
  286. words = []
  287. for sent in self._read_sent_block(stream):
  288. words.extend(sent)
  289. return words