stemmer_languages.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. from __future__ import unicode_literals
  2. from functools import partial
  3. from lunr.pipeline import Pipeline
  4. # map from ISO-639-1 codes to SnowballStemmer.languages
  5. SUPPORTED_LANGUAGES = {
  6. 'ar': 'arabic',
  7. 'da': 'danish',
  8. 'nl': 'dutch',
  9. 'en': 'english',
  10. 'fi': 'finnish',
  11. 'fr': 'french',
  12. 'de': 'german',
  13. 'hu': 'hungarian',
  14. 'it': 'italian',
  15. 'no': 'norwegian',
  16. 'pt': 'portuguese',
  17. 'ro': 'romanian',
  18. 'ru': 'russian',
  19. 'es': 'spanish',
  20. 'sv': 'swedish'
  21. }
  22. try: # pragma: no cover
  23. from nltk.stem.snowball import SnowballStemmer
  24. LANGUAGE_SUPPORT = True
  25. except ImportError: # pragma: no cover
  26. LANGUAGE_SUPPORT = False
  27. def get_language_stemmer(language):
  28. """Retrieves the SnowballStemmer for a particular language.
  29. Args:
  30. language (str): ISO-639-1 code of the language.
  31. """
  32. return SnowballStemmer(SUPPORTED_LANGUAGES[language])
  33. def nltk_stemmer(stemmer, token, i=None, tokens=None):
  34. """Wrapper around a NLTK SnowballStemmer, which includes stop words for
  35. each language.
  36. Args:
  37. stemmer (SnowballStemmer): Stemmer instance that performs the stemming.
  38. token (lunr.Token): The token to stem.
  39. i (int): The index of the token in a set.
  40. tokens (list): A list of tokens representing the set.
  41. """
  42. def wrapped_stem(token, metadata=None):
  43. return stemmer.stem(token)
  44. return token.update(wrapped_stem)
  45. def register_languages():
  46. """Register all supported languages to ensure compatibility."""
  47. for language in SUPPORTED_LANGUAGES:
  48. language_stemmer = partial(
  49. nltk_stemmer, get_language_stemmer(language))
  50. Pipeline.register_function(
  51. language_stemmer, 'stemmer-{}'.format(language))
  52. if LANGUAGE_SUPPORT: # pragma: no cover
  53. # TODO: registering all possible stemmers feels unnecessary but it solves
  54. # deserializing with arbitrary language functions. Ideally the schema would
  55. # provide the language(s) for the index and we could register the stemmers
  56. # as needed
  57. register_languages()