models.py

# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)
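

# Editorial usage sketch (not part of the original module): training and
# querying an MLE bigram model, assuming the nltk.lm.preprocessing helper
# padded_everygram_pipeline and a toy corpus.
#
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(2, [["a", "b", "b", "a"]])
#     >>> lm = MLE(2)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a"])  # relative frequency of "b" after "a"
#     0.5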


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel, also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Additive smoothing: Lidstone or Laplace.

        To see what kind, look at the `gamma` attribute on the class.
        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)


class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)
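

# Editorial check (not part of the original module): Laplace is simply Lidstone
# with gamma fixed to 1, so its scores follow (count + 1) / (context total +
# vocabulary size), where the vocabulary size includes the padding symbols and
# the <UNK> token once the model is fitted.
#
#     >>> train, vocab = padded_everygram_pipeline(2, [["a", "b", "b", "a"]])
#     >>> lm = Laplace(2)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a"])      # (1 + 1) / (2 + len(lm.vocab))
#     >>> Lidstone(0.1, 2).gamma    # Lidstone takes gamma as its first argument
#     0.1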


class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        assert issubclass(smoothing_cls, Smoothing)
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            return self.unmasked_score(word, context[1:])
        alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])
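

# Editorial note (not in the original module): the recursion above implements
# the standard interpolated smoothing form,
#
#     P(word | context) = alpha(word, context) + gamma(context) * P(word | context[1:]),
#
# where the concrete Smoothing subclass supplies alpha and gamma, and the
# recursion bottoms out at the estimator's unigram score.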


class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        super().__init__(KneserNey, order, params={"discount": discount}, **kwargs)
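

# Editorial usage sketch (not part of the original module): the interpolated
# models are trained the same way as MLE above; KneserNeyInterpolated takes a
# discount parameter, WittenBellInterpolated does not.
#
#     >>> train, vocab = padded_everygram_pipeline(3, [["a", "b", "b", "a"]])
#     >>> lm = KneserNeyInterpolated(3, discount=0.1)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a", "b"])  # interpolates with bigram and unigram estimates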