cistem.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: CISTEM Stemmer for German
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Leonie Weissweiler <l.weissweiler@outlook.de>
  5. # Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
  6. # Alexander Fraser <fraser@cis.lmu.de>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. import re
  10. from nltk.stem.api import StemmerI
  11. class Cistem(StemmerI):
  12. """
  13. CISTEM Stemmer for German
  14. This is the official Python implementation of the CISTEM stemmer.
  15. It is based on the paper
  16. Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German
  17. Based on a Comparative Analysis of Publicly Available Stemmers.
  18. In Proceedings of the German Society for Computational Linguistics and Language
  19. Technology (GSCL)
  20. which can be read here:
  21. http://www.cis.lmu.de/~weissweiler/cistem/
  22. In the paper, we conducted an analysis of publicly available stemmers,
  23. developed two gold standards for German stemming and evaluated the stemmers
  24. based on the two gold standards. We then proposed the stemmer implemented here
  25. and show that it achieves slightly better f-measure than the other stemmers and
  26. is thrice as fast as the Snowball stemmer for German while being about as fast
  27. as most other stemmers.
  28. case_insensitive is a a boolean specifying if case-insensitive stemming
  29. should be used. Case insensitivity improves performance only if words in the
  30. text may be incorrectly upper case. For all-lowercase and correctly cased
  31. text, best performance is achieved by setting case_insensitive for false.
  32. :param case_insensitive: if True, the stemming is case insensitive. False by default.
  33. :type case_insensitive: bool
  34. """
  35. strip_ge = re.compile(r"^ge(.{4,})")
  36. repl_xx = re.compile(r"(.)\1")
  37. strip_emr = re.compile(r"e[mr]$")
  38. strip_nd = re.compile(r"nd$")
  39. strip_t = re.compile(r"t$")
  40. strip_esn = re.compile(r"[esn]$")
  41. repl_xx_back = re.compile(r"(.)\*")
  42. def __init__(self, case_insensitive=False):
  43. self._case_insensitive = case_insensitive
  44. @staticmethod
  45. def replace_to(word):
  46. word = word.replace("sch", "$")
  47. word = word.replace("ei", "%")
  48. word = word.replace("ie", "&")
  49. word = Cistem.repl_xx.sub(r"\1*", word)
  50. return word
  51. @staticmethod
  52. def replace_back(word):
  53. word = Cistem.repl_xx_back.sub(r"\1\1", word)
  54. word = word.replace("%", "ei")
  55. word = word.replace("&", "ie")
  56. word = word.replace("$", "sch")
  57. return word
  58. def stem(self, word):
  59. """
  60. This method takes the word to be stemmed and returns the stemmed word.
  61. :param word: the word that is to be stemmed
  62. :type word: unicode
  63. :return word: the stemmed word
  64. :rtype: unicode
  65. >>> from nltk.stem.cistem import Cistem
  66. >>> stemmer = Cistem()
  67. >>> s1 = "Speicherbehältern"
  68. >>> stemmer.stem(s1)
  69. 'speicherbehalt'
  70. >>> s2 = "Grenzpostens"
  71. >>> stemmer.stem(s2)
  72. 'grenzpost'
  73. >>> s3 = "Ausgefeiltere"
  74. >>> stemmer.stem(s3)
  75. 'ausgefeilt'
  76. >>> stemmer = Cistem(True)
  77. >>> stemmer.stem(s1)
  78. 'speicherbehal'
  79. >>> stemmer.stem(s2)
  80. 'grenzpo'
  81. >>> stemmer.stem(s3)
  82. 'ausgefeil'
  83. """
  84. if len(word) == 0:
  85. return word
  86. upper = word[0].isupper()
  87. word = word.lower()
  88. word = word.replace("ü", "u")
  89. word = word.replace("ö", "o")
  90. word = word.replace("ä", "a")
  91. word = word.replace("ß", "ss")
  92. word = Cistem.strip_ge.sub(r"\1", word)
  93. word = Cistem.replace_to(word)
  94. while len(word) > 3:
  95. if len(word) > 5:
  96. (word, success) = Cistem.strip_emr.subn("", word)
  97. if success != 0:
  98. continue
  99. (word, success) = Cistem.strip_nd.subn("", word)
  100. if success != 0:
  101. continue
  102. if not upper or self._case_insensitive:
  103. (word, success) = Cistem.strip_t.subn("", word)
  104. if success != 0:
  105. continue
  106. (word, success) = Cistem.strip_esn.subn("", word)
  107. if success != 0:
  108. continue
  109. else:
  110. break
  111. word = Cistem.replace_back(word)
  112. return word
  113. def segment(self, word):
  114. """
  115. This method works very similarly to stem (:func:'cistem.stem'). The difference is that in
  116. addition to returning the stem, it also returns the rest that was removed at
  117. the end. To be able to return the stem unchanged so the stem and the rest
  118. can be concatenated to form the original word, all subsitutions that altered
  119. the stem in any other way than by removing letters at the end were left out.
  120. :param word: the word that is to be stemmed
  121. :type word: unicode
  122. :return word: the stemmed word
  123. :rtype: unicode
  124. :return word: the removed suffix
  125. :rtype: unicode
  126. >>> from nltk.stem.cistem import Cistem
  127. >>> stemmer = Cistem()
  128. >>> s1 = "Speicherbehältern"
  129. >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
  130. ('speicherbehält', 'ern')
  131. >>> s2 = "Grenzpostens"
  132. >>> stemmer.segment(s2)
  133. ('grenzpost', 'ens')
  134. >>> s3 = "Ausgefeiltere"
  135. >>> stemmer.segment(s3)
  136. ('ausgefeilt', 'ere')
  137. >>> stemmer = Cistem(True)
  138. >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
  139. ('speicherbehäl', 'tern')
  140. >>> stemmer.segment(s2)
  141. ('grenzpo', 'stens')
  142. >>> stemmer.segment(s3)
  143. ('ausgefeil', 'tere')
  144. """
  145. rest_length = 0
  146. if len(word) == 0:
  147. return ("", "")
  148. upper = word[0].isupper()
  149. word = word.lower()
  150. original = word[:]
  151. word = Cistem.replace_to(word)
  152. while len(word) > 3:
  153. if len(word) > 5:
  154. (word, success) = Cistem.strip_emr.subn("", word)
  155. if success != 0:
  156. rest_length += 2
  157. continue
  158. (word, success) = Cistem.strip_nd.subn("", word)
  159. if success != 0:
  160. rest_length += 2
  161. continue
  162. if not upper or self._case_insensitive:
  163. (word, success) = Cistem.strip_t.subn("", word)
  164. if success != 0:
  165. rest_length += 1
  166. continue
  167. (word, success) = Cistem.strip_esn.subn("", word)
  168. if success != 0:
  169. rest_length += 1
  170. continue
  171. else:
  172. break
  173. word = Cistem.replace_back(word)
  174. if rest_length:
  175. rest = original[-rest_length:]
  176. else:
  177. rest = ""
  178. return (word, rest)