smoothing.py

# Natural Language Toolkit: Language Model Smoothing
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Smoothing algorithms for language modeling.

According to Chen & Goodman (1995), these should work with both backoff and
interpolation.
"""
from nltk.lm.api import Smoothing


def _count_non_zero_vals(dictionary):
    """Count how many values in a mapping are greater than zero."""
    return sum(1.0 for c in dictionary.values() if c > 0)
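

# How a model consumes these estimators (a rough sketch, not part of this
# module): nltk.lm's interpolated models combine alpha_gamma with a
# recursive score over progressively shorter contexts, along the lines of
#
#     def unmasked_score(self, word, context=None):
#         if not context:
#             return self.estimator.unigram_score(word)
#         alpha, gamma = self.estimator.alpha_gamma(word, context)
#         return alpha + gamma * self.unmasked_score(word, context[1:])
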
class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        # Witten-Bell reserves mass for unseen events in proportion to the
        # number of distinct continuations of the context:
        #   gamma = N1+(context) / (N1+(context) + c(context))
        n_plus = _count_non_zero_vals(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)
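

# Usage sketch (assumes NLTK's high-level API, in which WittenBellInterpolated
# wires the WittenBell estimator above into an interpolated model):
#
#     >>> from nltk.lm import WittenBellInterpolated
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(2, [["a", "b", "a", "c"]])
#     >>> lm = WittenBellInterpolated(2)
#     >>> lm.fit(train, vocab)
#     >>> 0.0 <= lm.score("b", ["a"]) <= 1.0
#     True
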
class KneserNey(Smoothing):
    """Kneser-Ney smoothing."""

    def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def unigram_score(self, word):
        # Uniform base distribution; a simplification of the
        # continuation-count unigram described by Chen & Goodman.
        return 1.0 / len(self.vocab)

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        prefix_total_ngrams = prefix_counts.N()
        # Absolute discounting: subtract a fixed discount from every
        # observed count, never going below zero.
        alpha = max(prefix_counts[word] - self.discount, 0.0) / prefix_total_ngrams
        # The discounted mass is redistributed over the lower-order
        # distribution, weighted by the number of distinct continuations.
        gamma = (
            self.discount * _count_non_zero_vals(prefix_counts) / prefix_total_ngrams
        )
        return alpha, gamma
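

# Minimal smoke test (an illustrative sketch; KneserNeyInterpolated is the
# nltk.lm model that delegates to the KneserNey estimator above):
if __name__ == "__main__":
    from nltk.lm import KneserNeyInterpolated
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [["a", "b", "a", "c", "a", "b"]])
    lm = KneserNeyInterpolated(2, discount=0.1)
    lm.fit(train, vocab)
    # score returns a probability, so the value should lie in [0, 1].
    print("P(b | a) =", lm.score("b", ["a"]))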