regexp.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. # Natural Language Toolkit: Stemmers
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
  5. # Edward Loper <edloper@gmail.com>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. import re
  10. from nltk.stem.api import StemmerI
  11. class RegexpStemmer(StemmerI):
  12. """
  13. A stemmer that uses regular expressions to identify morphological
  14. affixes. Any substrings that match the regular expressions will
  15. be removed.
  16. >>> from nltk.stem import RegexpStemmer
  17. >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
  18. >>> st.stem('cars')
  19. 'car'
  20. >>> st.stem('mass')
  21. 'mas'
  22. >>> st.stem('was')
  23. 'was'
  24. >>> st.stem('bee')
  25. 'bee'
  26. >>> st.stem('compute')
  27. 'comput'
  28. >>> st.stem('advisable')
  29. 'advis'
  30. :type regexp: str or regexp
  31. :param regexp: The regular expression that should be used to
  32. identify morphological affixes.
  33. :type min: int
  34. :param min: The minimum length of string to stem
  35. """
  36. def __init__(self, regexp, min=0):
  37. if not hasattr(regexp, "pattern"):
  38. regexp = re.compile(regexp)
  39. self._regexp = regexp
  40. self._min = min
  41. def stem(self, word):
  42. if len(word) < self._min:
  43. return word
  44. else:
  45. return self._regexp.sub("", word)
  46. def __repr__(self):
  47. return "<RegexpStemmer: {!r}>".format(self._regexp.pattern)