stemmer.py 865 B

123456789101112131415161718192021222324252627
  1. def get_language_stemmer(language):
  2. """Retrieves the SnowballStemmer for a particular language.
  3. Args:
  4. language (str): ISO-639-1 code of the language.
  5. """
  6. from lunr.languages import SUPPORTED_LANGUAGES
  7. from nltk.stem.snowball import SnowballStemmer
  8. return SnowballStemmer(SUPPORTED_LANGUAGES[language])
  9. def nltk_stemmer(stemmer, token, i=None, tokens=None):
  10. """Wrapper around a NLTK SnowballStemmer, which includes stop words for
  11. each language.
  12. Args:
  13. stemmer (SnowballStemmer): Stemmer instance that performs the stemming.
  14. token (lunr.Token): The token to stem.
  15. i (int): The index of the token in a set.
  16. tokens (list): A list of tokens representing the set.
  17. """
  18. def wrapped_stem(token, metadata=None):
  19. return stemmer.stem(token)
  20. return token.update(wrapped_stem)