| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326 |
- # Natural Language Toolkit: Comparative Sentence Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- CorpusReader for the Comparative Sentence Dataset.
- - Comparative Sentence Dataset information -
- Annotated by: Nitin Jindal and Bing Liu, 2006.
- Department of Computer Science
- University of Illinois at Chicago
- Contact: Nitin Jindal, njindal@cs.uic.edu
- Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
- Distributed with permission.
- Related papers:
- - Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
- Proceedings of the ACM SIGIR International Conference on Information Retrieval
- (SIGIR-06), 2006.
- - Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
- Proceedings of Twenty First National Conference on Artificial Intelligence
- (AAAI-2006), 2006.
- - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
- Proceedings of the 22nd International Conference on Computational Linguistics
- (Coling-2008), Manchester, 18-22 August, 2008.
- """
- import re
- from nltk.corpus.reader.api import *
- from nltk.tokenize import *
# Regular expressions for dataset components
# Separator line made entirely of asterisks (delimits sections in the files).
STARS = re.compile(r"^\*+$")
# Opening tag for any comparison type: <cs-1> .. <cs-4>.
COMPARISON = re.compile(r"<cs-[1234]>")
# Closing tag for any comparison type: </cs-1> .. </cs-4>.
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
# Opening tag for a *gradable* comparison (types 1-3).
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
# Opening tag for a *non-gradable* comparison (type 4 only).
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
# Captures "<digit>_<text>" relation fields on an annotation line; the digit
# codes the role (1 = entity_1, 2 = entity_2, 3 = feature) and the lookahead
# stops the text before the next "<digit>_" field begins.
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
# Captures the content of the last parenthesized group at end of line
# (the comparative keyword/phrase on an annotation line).
KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
class Comparison(object):
    """
    Container for one comparative sentence together with its annotated
    constituents (entities, compared feature, and comparative keyword).
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparation.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Single-line representation showing every annotated field.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # View class used to lazily read blocks from the underlying files.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        # NOTE: the default WhitespaceTokenizer() is created once at class
        # definition time and shared by all readers; it is stateless, so
        # sharing is harmless.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
        # Lower-case for normalization and drop comparisons without a keyword
        # (non-gradable comparisons leave keyword=None).
        return {keyword.lower() for keyword in all_keywords if keyword}

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        # Close the stream deterministically; the original leaked the handle
        # until garbage collection.
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword list.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        contents = []
        for f in fileids:
            # Close each file promptly instead of leaking one handle per file.
            with self.open(f) as fp:
                contents.append(fp.read())
        return concat(contents)

    def readme(self):
        """
        Return the contents of the corpus readme file.
        """
        # Close the stream deterministically (was previously leaked).
        with self.open("README.txt") as fp:
            return fp.read()

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        """
        Read one annotated block from *stream* and return its comparisons.

        A block in the dataset looks like::

            <cs-1>          (tag line: one or more comparison-type tags)
            the sentence itself
            </cs-1>         (closing tag line)
            1_entity (keyword)   (one relation line per gradable tag)

        :return: a list of Comparison objects, or [] at end of file.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = COMPARISON.findall(line)
            if comparison_tags:
                grad_comparisons = GRAD_COMPARISON.findall(line)
                non_grad_comparisons = NON_GRAD_COMPARISON.findall(line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            for (code, entity_feat) in entities_feats:
                                # Role codes: 1 = entity_1, 2 = entity_2,
                                # 3 = compared feature.
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one (no relation line follows them).
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                return comparison_bundle

    def _read_keyword_block(self, stream):
        # Project each comparison in the next block onto its keyword
        # (may be None for non-gradable comparisons; filtered in keywords()).
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        """
        Read lines from *stream* until a plain sentence line is found, and
        return it tokenized. Star-separator sections and annotation lines
        (tag lines, relation lines) are skipped.
        """
        while True:
            line = stream.readline()
            if STARS.match(line):
                # Skip everything up to (and including) the closing star line.
                while True:
                    line = stream.readline()
                    if STARS.match(line):
                        break
                continue
            # A sentence line is one that carries no annotation markup.
            if (
                not COMPARISON.findall(line)
                and not ENTITIES_FEATS.findall(line)
                and not CLOSE_COMPARISON.findall(line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        # Flatten the sentences of the next block into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|