# Natural Language Toolkit: test_stem.py — unit tests for the Snowball and
# Porter stemmers.
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import unittest
  4. from contextlib import closing
  5. from nltk import data
  6. from nltk.stem.snowball import SnowballStemmer
  7. from nltk.stem.porter import PorterStemmer
  8. class SnowballTest(unittest.TestCase):
  9. def test_arabic(self):
  10. """
  11. this unit testing for test the snowball arabic light stemmer
  12. this stemmer deals with prefixes and suffixes
  13. """
  14. # Test where the ignore_stopwords=True.
  15. ar_stemmer = SnowballStemmer("arabic", True)
  16. assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
  17. assert ar_stemmer.stem("العربية") == "عرب"
  18. assert ar_stemmer.stem("فقالوا") == "قال"
  19. assert ar_stemmer.stem("الطالبات") == "طالب"
  20. assert ar_stemmer.stem("فالطالبات") == "طالب"
  21. assert ar_stemmer.stem("والطالبات") == "طالب"
  22. assert ar_stemmer.stem("الطالبون") == "طالب"
  23. assert ar_stemmer.stem("اللذان") == "اللذان"
  24. assert ar_stemmer.stem("من") == "من"
  25. # Test where the ignore_stopwords=False.
  26. ar_stemmer = SnowballStemmer("arabic", False)
  27. assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
  28. assert ar_stemmer.stem("الطالبات") == "طالب"
  29. assert ar_stemmer.stem("الكلمات") == "كلم"
  30. # test where create the arabic stemmer without given init value to ignore_stopwords
  31. ar_stemmer = SnowballStemmer("arabic")
  32. assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
  33. assert ar_stemmer.stem("العربية") == "عرب"
  34. assert ar_stemmer.stem("فقالوا") == "قال"
  35. assert ar_stemmer.stem("الطالبات") == "طالب"
  36. assert ar_stemmer.stem("الكلمات") == "كلم"
  37. def test_russian(self):
  38. stemmer_russian = SnowballStemmer("russian")
  39. assert stemmer_russian.stem("авантненькая") == "авантненьк"
  40. def test_german(self):
  41. stemmer_german = SnowballStemmer("german")
  42. stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
  43. assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
  44. assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
  45. assert stemmer_german.stem("keinen") == 'kein'
  46. assert stemmer_german2.stem("keinen") == 'keinen'
  47. def test_spanish(self):
  48. stemmer = SnowballStemmer('spanish')
  49. assert stemmer.stem("Visionado") == 'vision'
  50. # The word 'algue' was raising an IndexError
  51. assert stemmer.stem("algue") == 'algu'
  52. def test_short_strings_bug(self):
  53. stemmer = SnowballStemmer('english')
  54. assert stemmer.stem("y's") == 'y'
  55. class PorterTest(unittest.TestCase):
  56. def _vocabulary(self):
  57. with closing(
  58. data.find('stemmers/porter_test/porter_vocabulary.txt').open(
  59. encoding='utf-8'
  60. )
  61. ) as fp:
  62. return fp.read().splitlines()
  63. def _test_against_expected_output(self, stemmer_mode, expected_stems):
  64. stemmer = PorterStemmer(mode=stemmer_mode)
  65. for word, true_stem in zip(self._vocabulary(), expected_stems):
  66. our_stem = stemmer.stem(word)
  67. assert our_stem == true_stem, (
  68. "%s should stem to %s in %s mode but got %s"
  69. % (word, true_stem, stemmer_mode, our_stem)
  70. )
  71. def test_vocabulary_martin_mode(self):
  72. """Tests all words from the test vocabulary provided by M Porter
  73. The sample vocabulary and output were sourced from:
  74. http://tartarus.org/martin/PorterStemmer/voc.txt
  75. http://tartarus.org/martin/PorterStemmer/output.txt
  76. and are linked to from the Porter Stemmer algorithm's homepage
  77. at
  78. http://tartarus.org/martin/PorterStemmer/
  79. """
  80. with closing(
  81. data.find('stemmers/porter_test/porter_martin_output.txt').open(
  82. encoding='utf-8'
  83. )
  84. ) as fp:
  85. self._test_against_expected_output(
  86. PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
  87. )
  88. def test_vocabulary_nltk_mode(self):
  89. with closing(
  90. data.find('stemmers/porter_test/porter_nltk_output.txt').open(
  91. encoding='utf-8'
  92. )
  93. ) as fp:
  94. self._test_against_expected_output(
  95. PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
  96. )
  97. def test_vocabulary_original_mode(self):
  98. # The list of stems for this test was generated by taking the
  99. # Martin-blessed stemmer from
  100. # http://tartarus.org/martin/PorterStemmer/c.txt
  101. # and removing all the --DEPARTURE-- sections from it and
  102. # running it against Martin's test vocabulary.
  103. with closing(
  104. data.find('stemmers/porter_test/porter_original_output.txt').open(
  105. encoding='utf-8'
  106. )
  107. ) as fp:
  108. self._test_against_expected_output(
  109. PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
  110. )
  111. self._test_against_expected_output(
  112. PorterStemmer.ORIGINAL_ALGORITHM,
  113. data.find('stemmers/porter_test/porter_original_output.txt')
  114. .open(encoding='utf-8')
  115. .read()
  116. .splitlines(),
  117. )
  118. def test_oed_bug(self):
  119. """Test for bug https://github.com/nltk/nltk/issues/1581
  120. Ensures that 'oed' can be stemmed without throwing an error.
  121. """
  122. assert PorterStemmer().stem('oed') == 'o'