| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: SentiWordNet
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Christopher Potts <cgpotts@stanford.edu>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- An NLTK interface for SentiWordNet
- SentiWordNet is a lexical resource for opinion mining.
- SentiWordNet assigns to each synset of WordNet three
- sentiment scores: positivity, negativity, and objectivity.
- For details about SentiWordNet see:
- http://sentiwordnet.isti.cnr.it/
- >>> from nltk.corpus import sentiwordnet as swn
- >>> print(swn.senti_synset('breakdown.n.03'))
- <breakdown.n.03: PosScore=0.0 NegScore=0.25>
- >>> list(swn.senti_synsets('slow'))
- [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
- SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
- SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
- SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
- SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
- SentiSynset('behind.r.03')]
- >>> happy = swn.senti_synsets('happy', 'a')
- >>> happy0 = list(happy)[0]
- >>> happy0.pos_score()
- 0.875
- >>> happy0.neg_score()
- 0.0
- >>> happy0.obj_score()
- 0.125
- """
- import re
- from nltk.corpus.reader import CorpusReader
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for a single SentiWordNet data file.

    The file is parsed once, at construction time, into an in-memory
    table that maps ``(pos, offset)`` keys to ``(pos_score, neg_score)``
    pairs.  WordNet is imported lazily inside the lookup methods.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :param root: forwarded unchanged to ``CorpusReader.__init__``.
        :param fileids: forwarded unchanged to ``CorpusReader.__init__``;
            must resolve to exactly one file.
        :param encoding: forwarded unchanged to ``CorpusReader.__init__``.
        :raise ValueError: if the reader does not end up with exactly one
            file id, or if the data file contains a malformed line.
        """
        super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        # (pos, offset) -> (pos_score, neg_score)
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """
        Load the source file into ``self._db``.

        Lines whose first non-blank character is ``#`` are skipped.  Every
        other line must consist of exactly six tab-separated fields:
        pos, offset, positive score, negative score, synset terms, gloss.
        Only the first four are stored; a line whose pos or offset field
        is empty is ignored.

        :raise ValueError: if a non-comment line does not have exactly six
            fields (the message carries the 1-based line number within
            the file), or if a numeric field cannot be converted.
        """
        comment_line = re.compile(r"^\s*#")
        field_sep = re.compile(r"\t+")

        # Read everything up front and release the stream immediately
        # instead of leaving it for the garbage collector to close.
        with self.open(self._fileids[0]) as stream:
            lines = stream.read().splitlines()

        # Number the physical lines (comments included) so that the
        # position reported in an error matches what an editor shows.
        for lineno, line in enumerate(lines, 1):
            if comment_line.search(line):
                continue
            fields = [field.strip() for field in field_sep.split(line)]
            if len(fields) != 6:
                raise ValueError(
                    "Line %s formatted incorrectly: %s\n" % (lineno, line)
                )
            pos, offset, pos_score, neg_score = fields[:4]
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Return the ``SentiSynset`` for one synset.

        Two call forms are accepted:

        - ``senti_synset(pos, offset)`` -- a key exactly as stored in the
          parsed table;
        - ``senti_synset(name)`` -- a WordNet synset name such as
          ``'breakdown.n.03'``, used whenever the arguments are not a
          key of the parsed table.

        :return: a ``SentiSynset``, or ``None`` when a synset looked up by
            name has no entry in the parsed table.
        """
        from nltk.corpus import wordnet as wn

        key = tuple(vals)
        if key in self._db:
            pos_score, neg_score = self._db[key]
            pos, offset = vals
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)

        synset = wn.synset(vals[0])
        pos = synset.pos()
        # A synset reporting pos "s" is looked up under "a".
        if pos == "s":
            pos = "a"
        scores = self._db.get((pos, synset.offset()))
        if scores is None:
            return None
        pos_score, neg_score = scores
        return SentiSynset(pos_score, neg_score, synset)

    def senti_synsets(self, string, pos=None):
        """
        Return the ``SentiSynset`` objects for every WordNet synset of
        ``string``.

        :param string: the word handed to ``wn.synsets``.
        :param pos: optional part-of-speech restriction, handed to
            ``wn.synsets`` unchanged.
        :return: an iterable of ``SentiSynset`` objects in the order
            WordNet returned the synsets; synsets without an entry are
            dropped.  This is the result of ``filter`` (a one-shot lazy
            iterator on Python 3), so wrap it in ``list()`` if it has to
            be traversed more than once.
        """
        from nltk.corpus import wordnet as wn

        sentis = [
            self.senti_synset(synset.name()) for synset in wn.synsets(string, pos)
        ]
        # senti_synset() yields None for synsets without scores.
        return filter(None, sentis)

    def all_senti_synsets(self):
        """
        Generate a ``SentiSynset`` for every entry in the parsed table,
        resolving each ``(pos, offset)`` key through WordNet as it goes.
        """
        from nltk.corpus import wordnet as wn

        for (pos, offset), (pos_score, neg_score) in self._db.items():
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
class SentiSynset(object):
    """
    A synset bundled with its sentiment scores.

    The wrapped synset is exposed as the public attribute ``synset``;
    the three scores are read through accessor methods.
    """

    def __init__(self, pos_score, neg_score, synset):
        """
        :param pos_score: positivity score of the synset.
        :param neg_score: negativity score of the synset.
        :param synset: the synset object being described; it must
            provide a ``name()`` method for ``str()`` to work.
        """
        self.synset = synset
        self._pos_score = pos_score
        self._neg_score = neg_score
        # Objectivity is whatever remains once both polarities are
        # subtracted from 1.0.
        self._obj_score = 1.0 - (pos_score + neg_score)

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score, ``1.0 - (pos + neg)``."""
        return self._obj_score

    def __str__(self):
        """Render the synset name with its Pos/Neg scores only."""
        pieces = [
            "<",
            self.synset.name(),
            ": ",
            "PosScore=%s " % self._pos_score,
            "NegScore=%s" % self._neg_score,
            ">",
        ]
        return "".join(pieces)

    def __repr__(self):
        """Return the wrapped synset's repr prefixed with ``Senti``."""
        return "".join(("Senti", repr(self.synset)))
|