sentiwordnet.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: SentiWordNet
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Christopher Potts <cgpotts@stanford.edu>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. An NLTK interface for SentiWordNet
  10. SentiWordNet is a lexical resource for opinion mining.
  11. SentiWordNet assigns to each synset of WordNet three
  12. sentiment scores: positivity, negativity, and objectivity.
  13. For details about SentiWordNet see:
  14. http://sentiwordnet.isti.cnr.it/
  15. >>> from nltk.corpus import sentiwordnet as swn
  16. >>> print(swn.senti_synset('breakdown.n.03'))
  17. <breakdown.n.03: PosScore=0.0 NegScore=0.25>
  18. >>> list(swn.senti_synsets('slow'))
  19. [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
  20. SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
  21. SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
  22. SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
  23. SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
  24. SentiSynset('behind.r.03')]
  25. >>> happy = swn.senti_synsets('happy', 'a')
  26. >>> happy0 = list(happy)[0]
  27. >>> happy0.pos_score()
  28. 0.875
  29. >>> happy0.neg_score()
  30. 0.0
  31. >>> happy0.obj_score()
  32. 0.125
  33. """
  34. import re
  35. from nltk.corpus.reader import CorpusReader
  class SentiWordNetCorpusReader(CorpusReader):
      """
      Corpus reader that exposes SentiWordNet scores for WordNet synsets.

      The single source file is parsed once, at construction time, into an
      in-memory table ``self._db`` that maps ``(pos, offset)`` keys to
      ``(pos_score, neg_score)`` tuples of floats.
      """

      def __init__(self, root, fileids, encoding="utf-8"):
          """
          Construct a new SentiWordNet Corpus Reader, using data from
          the specified file.

          :param root: passed through to ``CorpusReader.__init__``.
          :param fileids: passed through to ``CorpusReader.__init__``; must
              resolve to exactly one file.
          :param encoding: passed through to ``CorpusReader.__init__``.
          :raises ValueError: if the reader does not end up with exactly one
              file, or if a data line of that file cannot be parsed.
          """
          super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
          if len(self._fileids) != 1:
              raise ValueError("Exactly one file must be specified")
          self._db = {}
          self._parse_src_file()

      def _parse_src_file(self):
          """
          Read the source file and fill ``self._db``.

          Lines whose first non-blank character is ``#`` are skipped. Every
          other line is split on runs of tabs and must yield exactly six
          fields: pos, offset, pos_score, neg_score, synset_terms, gloss.
          Rows with an empty pos or offset are ignored.

          :raises ValueError: if a non-comment line does not split into six
              fields. The number in the message is the 0-based position of
              the line among the non-comment lines, not its position in the
              file.
          """
          # Close the stream deterministically instead of leaving it to the
          # garbage collector.
          with self.open(self._fileids[0]) as stream:
              lines = stream.read().splitlines()

          # Compile once; the pattern is applied to every line of the file.
          is_comment = re.compile(r"^\s*#").search
          data_lines = (line for line in lines if not is_comment(line))

          for i, line in enumerate(data_lines):
              fields = [field.strip() for field in re.split(r"\t+", line)]
              try:
                  pos, offset, pos_score, neg_score, synset_terms, gloss = fields
              except ValueError:
                  # Tuple unpacking raises ValueError on a wrong field count;
                  # catching only that keeps KeyboardInterrupt and friends
                  # from being rewritten into a formatting error.
                  raise ValueError("Line %s formatted incorrectly: %s\n" % (i, line))
              if pos and offset:
                  self._db[(pos, int(offset))] = (float(pos_score), float(neg_score))

      def senti_synset(self, *vals):
          """
          Return the ``SentiSynset`` for one synset.

          Two call forms are supported:

          - ``senti_synset(pos, offset)``: used when that exact pair is a key
            of the score table.
          - ``senti_synset(name)``: otherwise the first argument is handed to
            ``wn.synset`` (e.g. ``'breakdown.n.03'``) and the resulting
            synset's pos and offset are looked up.

          In both forms a pos of ``"s"`` is treated as ``"a"``.

          :return: a ``SentiSynset``, or None when the synset resolved from
              the name has no entry in the score table.
          """
          # Imported at call time, as in the rest of this reader.
          from nltk.corpus import wordnet as wn

          key = tuple(vals)
          scores = self._db.get(key)
          if scores is not None:
              pos_score, neg_score = scores
              pos, offset = vals
              if pos == "s":
                  pos = "a"
              synset = wn.synset_from_pos_and_offset(pos, offset)
              return SentiSynset(pos_score, neg_score, synset)

          synset = wn.synset(vals[0])
          pos = synset.pos()
          if pos == "s":
              pos = "a"
          scores = self._db.get((pos, synset.offset()))
          if scores is None:
              return None
          pos_score, neg_score = scores
          return SentiSynset(pos_score, neg_score, synset)

      def senti_synsets(self, string, pos=None):
          """
          Return the scored synsets that ``wn.synsets(string, pos)`` finds.

          :param string: passed to ``wn.synsets`` as its first argument.
          :param pos: passed to ``wn.synsets`` as its second argument.
          :return: a ``filter`` iterator over ``SentiSynset`` objects; synsets
              without a score entry are dropped. Wrap it in ``list()`` to
              index or to iterate more than once.
          """
          from nltk.corpus import wordnet as wn

          # The lookups run eagerly here; only the dropping of None entries
          # is deferred to iteration, which matches the previous behaviour.
          sentis = [self.senti_synset(synset.name()) for synset in wn.synsets(string, pos)]
          return filter(None, sentis)

      def all_senti_synsets(self):
          """
          Yield a ``SentiSynset`` for every entry of the score table.

          Each synset is resolved with ``wn.synset_from_pos_and_offset`` as
          the generator is consumed.
          """
          from nltk.corpus import wordnet as wn

          for (pos, offset), (pos_score, neg_score) in self._db.items():
              synset = wn.synset_from_pos_and_offset(pos, offset)
              yield SentiSynset(pos_score, neg_score, synset)
  class SentiSynset(object):
      """
      A synset bundled with its positivity, negativity and objectivity scores.

      The wrapped synset object is available as the public ``synset``
      attribute; the scores are read through the accessor methods.
      """

      def __init__(self, pos_score, neg_score, synset):
          """
          :param pos_score: positivity score.
          :param neg_score: negativity score.
          :param synset: the synset the scores belong to; it must provide a
              ``name()`` method for ``str()`` to work.

          The objectivity score is derived as ``1.0 - (pos_score + neg_score)``.
          """
          self._pos_score = pos_score
          self._neg_score = neg_score
          self._obj_score = 1.0 - (self._pos_score + self._neg_score)
          self.synset = synset

      def pos_score(self):
          """Return the positivity score given at construction."""
          return self._pos_score

      def neg_score(self):
          """Return the negativity score given at construction."""
          return self._neg_score

      def obj_score(self):
          """Return the objectivity score, ``1.0 - (pos_score + neg_score)``."""
          return self._obj_score

      def __str__(self):
          """Prints just the Pos/Neg scores for now."""
          # One format expression instead of a chain of += concatenations.
          return "<%s: PosScore=%s NegScore=%s>" % (
              self.synset.name(),
              self._pos_score,
              self._neg_score,
          )

      def __repr__(self):
          """Return the wrapped synset's repr prefixed with ``Senti``."""
          return "Senti" + repr(self.synset)