test_preprocessing.py 964 B

123456789101112131415161718192021222324252627282930
  1. # Natural Language Toolkit: Language Model Unit Tests
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import unittest
  8. from nltk.lm.preprocessing import padded_everygram_pipeline
  9. class TestPreprocessing(unittest.TestCase):
  10. def test_padded_everygram_pipeline(self):
  11. expected_train = [
  12. [
  13. ("<s>",),
  14. ("a",),
  15. ("b",),
  16. ("c",),
  17. ("</s>",),
  18. ("<s>", "a"),
  19. ("a", "b"),
  20. ("b", "c"),
  21. ("c", "</s>"),
  22. ]
  23. ]
  24. expected_vocab = ["<s>", "a", "b", "c", "</s>"]
  25. train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
  26. self.assertEqual([list(sent) for sent in train_data], expected_train)
  27. self.assertEqual(list(vocab_data), expected_vocab)