vader.py

# coding: utf-8
# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
import math
import re
import string
from itertools import product

import nltk.data
from nltk.util import pairwise

class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    ##Constants##
    # (empirically derived mean sentiment intensity rating increase for booster words)
    B_INCR = 0.293
    B_DECR = -0.293

    # (empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word)
    C_INCR = 0.733
    N_SCALAR = -0.74

    NEGATE = {
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt",
        "doesnt", "ain't", "aren't", "can't", "couldn't", "daren't", "didn't",
        "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt",
        "mustnt", "neither", "don't", "hadn't", "hasn't", "haven't", "isn't",
        "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope",
        "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt",
        "uhuh", "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "uh-uh",
        "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
        "wouldn't", "rarely", "seldom", "despite",
    }
    # booster/dampener 'intensifiers' or 'degree adverbs'
    # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
    BOOSTER_DICT = {
        "absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR,
        "completely": B_INCR, "considerably": B_INCR, "decidedly": B_INCR,
        "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
        "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR,
        "extremely": B_INCR, "fabulously": B_INCR, "flipping": B_INCR,
        "flippin": B_INCR, "fricking": B_INCR, "frickin": B_INCR,
        "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR,
        "fucking": B_INCR, "greatly": B_INCR, "hella": B_INCR,
        "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
        "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR,
        "most": B_INCR, "particularly": B_INCR, "purely": B_INCR,
        "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
        "so": B_INCR, "substantially": B_INCR, "thoroughly": B_INCR,
        "totally": B_INCR, "tremendously": B_INCR, "uber": B_INCR,
        "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
        "very": B_INCR,
        "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR,
        "just enough": B_DECR, "kind of": B_DECR, "kinda": B_DECR,
        "kindof": B_DECR, "kind-of": B_DECR, "less": B_DECR,
        "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR,
        "partly": B_DECR, "scarcely": B_DECR, "slightly": B_DECR,
        "somewhat": B_DECR, "sort of": B_DECR, "sorta": B_DECR,
        "sortof": B_DECR, "sort-of": B_DECR,
    }
    # check for special case idioms using a sentiment-laden keyword known to SAGE
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # for removing punctuation
    REGEX_REMOVE_PUNCTUATION = re.compile("[{0}]".format(re.escape(string.punctuation)))

    PUNC_LIST = [
        ".", "!", "?", ",", ";", ":", "-", "'", '"',
        "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?",
    ]

    def __init__(self):
        pass
    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        for first, second in pairwise(input_words):
            if second.lower() == "least" and first.lower() != "at":
                return True
        return False
    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score
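    # For example (illustrative values): normalize(4) == 4 / math.sqrt(16 + 15),
    # roughly 0.718. The result approaches +/-1 as the raw score grows in
    # magnitude, and alpha controls how quickly it saturates.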
    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # check if booster/dampener word is in ALLCAPS (while others aren't)
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar
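    # Worked example (hypothetical call): scalar_inc_dec("VERY", 2.0, True)
    # returns B_INCR + C_INCR == 0.293 + 0.733 == 1.026, since "very" is a
    # booster and the ALL-CAPS spelling adds further emphasis.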


class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        if not isinstance(text, str):
            text = str(text.encode("utf-8"))
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set(w for w in words_only if len(w) > 1)
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes
    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different
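    # e.g. allcap_differential(["GREAT", "movie"]) -> True (mixed casing),
    # while ["GREAT", "MOVIE"] and ["great", "movie"] both -> False.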


class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()
    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            # skip blank lines (e.g. a trailing newline at end of file)
            if not line:
                continue
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
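    # Each lexicon line is tab-separated; e.g. a line of the form
    # "great\t3.1\t..." yields lex_dict["great"] == 3.1 (only the first two
    # fields, the token and its mean valence, are used; values illustrative).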
    def polarity_scores(self, text):
        """
        Return a dict of sentiment scores for the input text. Positive values
        indicate positive valence, negative values indicate negative valence.
        """
        # text, words_and_emoticons, is_cap_diff = self.preprocess(text)
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons

        # enumerate rather than list.index(): index() returns the position of
        # the first match, which misindexes repeated tokens
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue
            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)
        return self.score_valence(sentiments, text)
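    # e.g. polarity_scores("VADER is great!") returns a dict of the form
    # {"neg": 0.0, "neu": ..., "pos": ..., "compound": ...}; the exact
    # numbers depend on the lexicon entries.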
    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2, "under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments
    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence
    def _but_check(self, words_and_emoticons, sentiments):
        but = {"but", "BUT"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments
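    # e.g. in "The food is great but the service is awful", scores before
    # "but" are halved and scores after it are scaled by 1.5, so the clause
    # following "but" dominates the overall sentiment.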
    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = "{0} {1}".format(
            words_and_emoticons[i - 2], words_and_emoticons[i - 1]
        )

        threetwoone = "{0} {1} {2}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{0} {1}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = "{0} {1}".format(
                words_and_emoticons[i], words_and_emoticons[i + 1]
            )
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence
    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence
    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
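    # e.g. "Great!!!" contributes 3 * 0.292 == 0.876 of extra emphasis, and
    # "Really??" contributes 2 * 0.18 == 0.36; four or more question marks
    # cap the question-mark boost at 0.96.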
    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                # compensates for neutral words that are counted as 1
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # when used with math.fabs(), compensates for neutrals
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count
    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)
        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
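

# Usage sketch (not part of the original module): a minimal demonstration of
# the analyzer on a few sample sentences. It assumes the vader_lexicon NLTK
# resource is available locally, e.g. via nltk.download("vader_lexicon").
if __name__ == "__main__":
    sia = SentimentIntensityAnalyzer()
    for sentence in [
        "VADER is smart, handsome, and funny.",
        "VADER is not smart, handsome, nor funny.",
        "The book was kind of good.",
        "The food is great but the service is awful!!",
    ]:
        print(sentence, "->", sia.polarity_scores(sentence))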